In [None]:
!pip install imblearn

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from collections import Counter
import sagemaker
import boto3
import os

In [None]:
# Load the pre-processed, comma-separated classification dataset
data = pd.read_csv('processed_data_classification_v2.csv', sep=',')

In [None]:
# Randomly keep 90% of the rows (sampling without replacement).
# A fixed random_state makes the subsample identical on every
# "Restart Kernel -> Run All"; the original draw was unseeded.
# NOTE: this rebinds `data`, so re-running only this cell samples 90% of the
# already-reduced frame again.
data = data.sample(n=int(0.9 * len(data)), replace=False, random_state=42)
len(data)

In [None]:
# Display the (rows, columns) dimensions of `data`
data.shape

In [None]:
# Count rows per distinct value of 'fraudulent_provider' to inspect class balance
# (presumably this is the label column, i.e. data.columns[0] -- TODO confirm)
data['fraudulent_provider'].value_counts()

In [None]:
# The first column is treated as the label; every remaining column is a feature
label_column = data.columns[0]
feature_columns = data.columns[1:]

# Materialise both as float32 NumPy arrays
labels = data[label_column].values.astype('float32')
features = data.loc[:, feature_columns].values.astype('float32')

In [None]:
# Hold out 10% of the rows as the test set, stratified on the label so both
# parts keep the same class proportions. random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, stratify=labels, random_state=42)

In [None]:
# Apply SMOTE
over = SMOTE(sampling_strategy=0.25)
under = RandomUnderSampler(sampling_strategy=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_smote, y_smote = pipeline.fit_resample(X_train, y_train)

# check for imbalance again
print(sorted(Counter(y_smote).items()))

In [None]:
X_smote_train, X_smote_validation, y_smote_train, y_smote_validation = train_test_split(
    X_smote, y_smote, test_size=0.1, stratify=y_smote)

In [None]:
# Build label-first matrices: column 0 is the target, the rest are the features.
# Each label vector is turned into a single column and glued to the left of
# its feature matrix, then wrapped in a DataFrame with default index/columns.
trainX_concate = np.concatenate(
    [y_smote_train.reshape(-1, 1), X_smote_train], axis=1)
trainX = pd.DataFrame(trainX_concate)

validationX_concate = np.concatenate(
    [y_smote_validation.reshape(-1, 1), X_smote_validation], axis=1)
validationX = pd.DataFrame(validationX_concate)

testX_concate = np.concatenate(
    [y_test.reshape(-1, 1), X_test], axis=1)
testX = pd.DataFrame(testX_concate)

In [None]:
# Dimensions of the training frame (one label column followed by the feature columns)
trainX.shape

In [None]:
# Dimensions of the test frame (one label column followed by the feature columns)
testX.shape

In [None]:
# Dimensions of the validation frame (one label column followed by the feature columns)
validationX.shape

In [None]:
# Write every split to its own CSV file, without a header row or index column
csv_outputs = {
    "cms_payment_train.csv": trainX,
    "cms_payment_validation.csv": validationX,
    "cms_payment_test.csv": testX,
}
for csv_path, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_path, header=False, index=False)

In [None]:
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'fraud-detect-demo'

# Upload training and validation data to an S3 bucket in the client account.
# S3 object keys always use '/' as the separator, so the key is joined
# explicitly instead of with os.path.join (whose separator depends on the OS).
# The bucket handle is created once and reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for local_csv in ("cms_payment_train.csv", "cms_payment_validation.csv"):
    s3_bucket.Object("/".join((prefix, local_csv))).upload_file(local_csv)

# The test dataset is used in the server account for testing purposes only