{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "761e99a0", "metadata": {}, "outputs": [], "source": [ "!pip install imblearn" ] }, { "cell_type": "code", "execution_count": null, "id": "741344ca", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from imblearn.over_sampling import SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from imblearn.pipeline import Pipeline\n", "from sklearn.model_selection import train_test_split\n", "from collections import Counter\n", "import sagemaker\n", "import boto3\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "id": "0e1818f9", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('processed_data_classification_v2.csv', delimiter=',')" ] }, { "cell_type": "code", "execution_count": null, "id": "b4877dfa", "metadata": {}, "outputs": [], "source": [ "# Random sampling 90% of the data\n", "data = data.loc[np.random.choice(data.index, int(0.9*len(data)), replace=False)]\n", "len(data)" ] }, { "cell_type": "code", "execution_count": null, "id": "439cbf51", "metadata": {}, "outputs": [], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "ed16606e", "metadata": {}, "outputs": [], "source": [ "data['fraudulent_provider'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "ac5d33dc", "metadata": {}, "outputs": [], "source": [ "feature_columns = data.columns[1:]\n", "label_column = data.columns[0]\n", "\n", "features = data[feature_columns].values.astype('float32')\n", "labels = (data[label_column].values).astype('float32')" ] }, { "cell_type": "code", "execution_count": null, "id": "f189e3d0", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " features, labels, test_size=0.1, stratify=labels)" ] }, { "cell_type": "code", "execution_count": null, "id": "09c350b0", "metadata": {}, "outputs": [], "source": [ "# Apply SMOTE\n", "over = SMOTE(sampling_strategy=0.25)\n", "under = RandomUnderSampler(sampling_strategy=1)\n", "steps = [('o', over), ('u', under)]\n", "pipeline = Pipeline(steps=steps)\n", "X_smote, y_smote = pipeline.fit_resample(X_train, y_train)\n", "\n", "# check for imbalance again\n", "print(sorted(Counter(y_smote).items()))" ] }, { "cell_type": "code", "execution_count": null, "id": "79c6f902", "metadata": {}, "outputs": [], "source": [ "X_smote_train, X_smote_validation, y_smote_train, y_smote_validation = train_test_split(\n", " X_smote, y_smote, test_size=0.1, stratify=y_smote)" ] }, { "cell_type": "code", "execution_count": null, "id": "998890a6", "metadata": {}, "outputs": [], "source": [ "# first column is target column\n", "trainX_concate = np.concatenate((y_smote_train.reshape(len(y_smote_train),1), X_smote_train), axis=1)\n", "trainX = pd.DataFrame(trainX_concate, index=None, columns=None)\n", "\n", "validationX_concate = np.concatenate((y_smote_validation.reshape(len(y_smote_validation),1), X_smote_validation), axis=1)\n", "validationX = pd.DataFrame(validationX_concate, index=None, columns=None)\n", "\n", "testX_concate = np.concatenate((y_test.reshape(len(y_test),1), X_test), axis=1)\n", "testX = pd.DataFrame(testX_concate, index=None, columns=None)" ] }, { "cell_type": "code", "execution_count": null, "id": "58b2072f", "metadata": {}, "outputs": [], "source": [ "trainX.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "63e39410", "metadata": {}, "outputs": [], "source": [ "testX.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "a0387c1d", "metadata": {}, "outputs": [], "source": [ "validationX.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "ed7bfeb7", "metadata": {}, "outputs": [], "source": [ "trainX.to_csv(\"cms_payment_train.csv\", header=False, index=False)\n", "validationX.to_csv(\"cms_payment_validation.csv\", header=False, index=False)\n", "testX.to_csv(\"cms_payment_test.csv\", header=False, index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "4eb3be51", "metadata": {}, "outputs": [], "source": [ "session = sagemaker.Session()\n", "bucket = session.default_bucket()\n", "prefix = 'fraud-detect-demo'\n", "\n", "# Upload training and validation data to a S3 bucket in client account\n", "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, \"cms_payment_train.csv\")).upload_file(\"cms_payment_train.csv\")\n", "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, \"cms_payment_validation.csv\")).upload_file(\"cms_payment_validation.csv\")\n", "\n", "# Testing dataset is used in server account for testing purpose only" ] } ], "metadata": { "availableInstances": [ { "_defaultOrder": 0, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "memoryGiB": 4, "name": "ml.t3.medium", "vcpuNum": 2 }, { "_defaultOrder": 1, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.t3.large", "vcpuNum": 2 }, { "_defaultOrder": 2, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.t3.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 3, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.t3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 4, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.m5.large", "vcpuNum": 2 }, { "_defaultOrder": 5, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.m5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 6, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.m5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 7, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 64, "name": "ml.m5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 8, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 128, "name": "ml.m5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 9, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 192, "name": "ml.m5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 10, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 256, "name": "ml.m5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 11, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 384, "name": "ml.m5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 12, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 8, "name": "ml.m5d.large", "vcpuNum": 2 }, { "_defaultOrder": 13, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 16, "name": "ml.m5d.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 14, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 32, "name": "ml.m5d.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 15, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 64, "name": "ml.m5d.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 16, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 128, "name": "ml.m5d.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 17, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 192, "name": "ml.m5d.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 18, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 256, "name": "ml.m5d.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 19, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "memoryGiB": 384, "name": "ml.m5d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 20, "_isFastLaunch": true, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 4, "name": "ml.c5.large", "vcpuNum": 2 }, { "_defaultOrder": 21, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 8, "name": "ml.c5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 22, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 16, "name": "ml.c5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 23, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 32, "name": "ml.c5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 24, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 72, "name": "ml.c5.9xlarge", "vcpuNum": 36 }, { "_defaultOrder": 25, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 96, "name": "ml.c5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 26, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 144, "name": "ml.c5.18xlarge", "vcpuNum": 72 }, { "_defaultOrder": 27, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "memoryGiB": 192, "name": "ml.c5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 28, "_isFastLaunch": true, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 16, "name": "ml.g4dn.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 29, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 32, "name": "ml.g4dn.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 30, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 64, "name": "ml.g4dn.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 31, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 128, "name": "ml.g4dn.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 32, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 192, "name": "ml.g4dn.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 33, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 256, "name": "ml.g4dn.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 34, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 61, "name": "ml.p3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 35, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 244, "name": "ml.p3.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 36, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 488, "name": "ml.p3.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 37, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 768, "name": "ml.p3dn.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 38, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 16, "name": "ml.r5.large", "vcpuNum": 2 }, { "_defaultOrder": 39, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 32, "name": "ml.r5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 40, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 64, "name": "ml.r5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 41, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 128, "name": "ml.r5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 42, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 256, "name": "ml.r5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 43, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 384, "name": "ml.r5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 44, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 512, "name": "ml.r5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 45, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "memoryGiB": 768, "name": "ml.r5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 46, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 16, "name": "ml.g5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 47, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 32, "name": "ml.g5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 48, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 64, "name": "ml.g5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 49, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 128, "name": "ml.g5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 50, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "memoryGiB": 256, "name": "ml.g5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 51, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 192, "name": "ml.g5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 52, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "memoryGiB": 384, "name": "ml.g5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 53, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "memoryGiB": 768, "name": "ml.g5.48xlarge", "vcpuNum": 192 } ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }