{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Credit card fraud detector using Amazon Fraud Detector"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Investigate and process the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import clear_output\n",
"from datetime import datetime\n",
"from io import StringIO\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from imblearn.over_sampling import SMOTENC\n",
"from collections import Counter\n",
"from sklearn.metrics import roc_curve, roc_auc_score, auc, roc_auc_score\n",
"%matplotlib inline\n",
"\n",
"import os\n",
"import sys\n",
"import json\n",
"import uuid\n",
"import numpy as np \n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import boto3\n",
"import time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's start by downloading the dataset from https://www.kaggle.com/mlg-ulb/creditcardfraud?select=creditcard.csv and uploading it to the notebook file system as `creditcard.csv`."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Resources and env variables setup\n",
"# boto3 handles: an S3 resource and an Amazon Fraud Detector client\n",
"s3_resource = boto3.resource('s3')\n",
"afd_resource = boto3.client('frauddetector')\n",
"\n",
"# suffix is appended to detector and model name for uniqueness \n",
"# (the date this cell is run, formatted as YYYYMMDD)\n",
"sufx = datetime.now().strftime(\"%Y%m%d\")\n",
"# Bucket created by the CloudFormation stack. Set the AFD_S3_BUCKET environment\n",
"# variable to use your own bucket without editing this cell; the literal below\n",
"# is only the fallback default.\n",
"S3_BUCKET = os.environ.get('AFD_S3_BUCKET', \"afd-poc-trainingbucket-1i37svk9elcoe\")\n",
"# IAM role ARN created by the CloudFormation stack. Set the AFD_ARN_ROLE\n",
"# environment variable to override the fallback default below.\n",
"ARN_ROLE = os.environ.get('AFD_ARN_ROLE', \"arn:aws:iam::387461613214:role/afd-poc-NotebookInstanceExecutionRole-1FNQ41S8H2G68\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Raise pandas' display limits (rows, columns, line width) for the wide frames below\n",
"for option_name, option_value in (\n",
"    ('display.max_rows', 500),\n",
"    ('display.max_columns', 500),\n",
"    ('display.width', 1000),\n",
"):\n",
"    pd.set_option(option_name, option_value)\n",
"\n",
"# Read the comma-separated dataset uploaded next to this notebook\n",
"data = pd.read_csv('creditcard.csv', sep=',')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's take a peek at our data: the column names, followed by summary statistics for every column:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Time \n",
" V1 \n",
" V2 \n",
" V3 \n",
" V4 \n",
" V5 \n",
" V6 \n",
" V7 \n",
" V8 \n",
" V9 \n",
" V10 \n",
" V11 \n",
" V12 \n",
" V13 \n",
" V14 \n",
" V15 \n",
" V16 \n",
" V17 \n",
" V18 \n",
" V19 \n",
" V20 \n",
" V21 \n",
" V22 \n",
" V23 \n",
" V24 \n",
" V25 \n",
" V26 \n",
" V27 \n",
" V28 \n",
" Amount \n",
" Class \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 284807.000000 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 2.848070e+05 \n",
" 284807.000000 \n",
" 284807.000000 \n",
" \n",
" \n",
" mean \n",
" 94813.859575 \n",
" 3.919560e-15 \n",
" 5.688174e-16 \n",
" -8.769071e-15 \n",
" 2.782312e-15 \n",
" -1.552563e-15 \n",
" 2.010663e-15 \n",
" -1.694249e-15 \n",
" -1.927028e-16 \n",
" -3.137024e-15 \n",
" 1.768627e-15 \n",
" 9.170318e-16 \n",
" -1.810658e-15 \n",
" 1.693438e-15 \n",
" 1.479045e-15 \n",
" 3.482336e-15 \n",
" 1.392007e-15 \n",
" -7.528491e-16 \n",
" 4.328772e-16 \n",
" 9.049732e-16 \n",
" 5.085503e-16 \n",
" 1.537294e-16 \n",
" 7.959909e-16 \n",
" 5.367590e-16 \n",
" 4.458112e-15 \n",
" 1.453003e-15 \n",
" 1.699104e-15 \n",
" -3.660161e-16 \n",
" -1.206049e-16 \n",
" 88.349619 \n",
" 0.001727 \n",
" \n",
" \n",
" std \n",
" 47488.145955 \n",
" 1.958696e+00 \n",
" 1.651309e+00 \n",
" 1.516255e+00 \n",
" 1.415869e+00 \n",
" 1.380247e+00 \n",
" 1.332271e+00 \n",
" 1.237094e+00 \n",
" 1.194353e+00 \n",
" 1.098632e+00 \n",
" 1.088850e+00 \n",
" 1.020713e+00 \n",
" 9.992014e-01 \n",
" 9.952742e-01 \n",
" 9.585956e-01 \n",
" 9.153160e-01 \n",
" 8.762529e-01 \n",
" 8.493371e-01 \n",
" 8.381762e-01 \n",
" 8.140405e-01 \n",
" 7.709250e-01 \n",
" 7.345240e-01 \n",
" 7.257016e-01 \n",
" 6.244603e-01 \n",
" 6.056471e-01 \n",
" 5.212781e-01 \n",
" 4.822270e-01 \n",
" 4.036325e-01 \n",
" 3.300833e-01 \n",
" 250.120109 \n",
" 0.041527 \n",
" \n",
" \n",
" min \n",
" 0.000000 \n",
" -5.640751e+01 \n",
" -7.271573e+01 \n",
" -4.832559e+01 \n",
" -5.683171e+00 \n",
" -1.137433e+02 \n",
" -2.616051e+01 \n",
" -4.355724e+01 \n",
" -7.321672e+01 \n",
" -1.343407e+01 \n",
" -2.458826e+01 \n",
" -4.797473e+00 \n",
" -1.868371e+01 \n",
" -5.791881e+00 \n",
" -1.921433e+01 \n",
" -4.498945e+00 \n",
" -1.412985e+01 \n",
" -2.516280e+01 \n",
" -9.498746e+00 \n",
" -7.213527e+00 \n",
" -5.449772e+01 \n",
" -3.483038e+01 \n",
" -1.093314e+01 \n",
" -4.480774e+01 \n",
" -2.836627e+00 \n",
" -1.029540e+01 \n",
" -2.604551e+00 \n",
" -2.256568e+01 \n",
" -1.543008e+01 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 54201.500000 \n",
" -9.203734e-01 \n",
" -5.985499e-01 \n",
" -8.903648e-01 \n",
" -8.486401e-01 \n",
" -6.915971e-01 \n",
" -7.682956e-01 \n",
" -5.540759e-01 \n",
" -2.086297e-01 \n",
" -6.430976e-01 \n",
" -5.354257e-01 \n",
" -7.624942e-01 \n",
" -4.055715e-01 \n",
" -6.485393e-01 \n",
" -4.255740e-01 \n",
" -5.828843e-01 \n",
" -4.680368e-01 \n",
" -4.837483e-01 \n",
" -4.988498e-01 \n",
" -4.562989e-01 \n",
" -2.117214e-01 \n",
" -2.283949e-01 \n",
" -5.423504e-01 \n",
" -1.618463e-01 \n",
" -3.545861e-01 \n",
" -3.171451e-01 \n",
" -3.269839e-01 \n",
" -7.083953e-02 \n",
" -5.295979e-02 \n",
" 5.600000 \n",
" 0.000000 \n",
" \n",
" \n",
" 50% \n",
" 84692.000000 \n",
" 1.810880e-02 \n",
" 6.548556e-02 \n",
" 1.798463e-01 \n",
" -1.984653e-02 \n",
" -5.433583e-02 \n",
" -2.741871e-01 \n",
" 4.010308e-02 \n",
" 2.235804e-02 \n",
" -5.142873e-02 \n",
" -9.291738e-02 \n",
" -3.275735e-02 \n",
" 1.400326e-01 \n",
" -1.356806e-02 \n",
" 5.060132e-02 \n",
" 4.807155e-02 \n",
" 6.641332e-02 \n",
" -6.567575e-02 \n",
" -3.636312e-03 \n",
" 3.734823e-03 \n",
" -6.248109e-02 \n",
" -2.945017e-02 \n",
" 6.781943e-03 \n",
" -1.119293e-02 \n",
" 4.097606e-02 \n",
" 1.659350e-02 \n",
" -5.213911e-02 \n",
" 1.342146e-03 \n",
" 1.124383e-02 \n",
" 22.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 75% \n",
" 139320.500000 \n",
" 1.315642e+00 \n",
" 8.037239e-01 \n",
" 1.027196e+00 \n",
" 7.433413e-01 \n",
" 6.119264e-01 \n",
" 3.985649e-01 \n",
" 5.704361e-01 \n",
" 3.273459e-01 \n",
" 5.971390e-01 \n",
" 4.539234e-01 \n",
" 7.395934e-01 \n",
" 6.182380e-01 \n",
" 6.625050e-01 \n",
" 4.931498e-01 \n",
" 6.488208e-01 \n",
" 5.232963e-01 \n",
" 3.996750e-01 \n",
" 5.008067e-01 \n",
" 4.589494e-01 \n",
" 1.330408e-01 \n",
" 1.863772e-01 \n",
" 5.285536e-01 \n",
" 1.476421e-01 \n",
" 4.395266e-01 \n",
" 3.507156e-01 \n",
" 2.409522e-01 \n",
" 9.104512e-02 \n",
" 7.827995e-02 \n",
" 77.165000 \n",
" 0.000000 \n",
" \n",
" \n",
" max \n",
" 172792.000000 \n",
" 2.454930e+00 \n",
" 2.205773e+01 \n",
" 9.382558e+00 \n",
" 1.687534e+01 \n",
" 3.480167e+01 \n",
" 7.330163e+01 \n",
" 1.205895e+02 \n",
" 2.000721e+01 \n",
" 1.559499e+01 \n",
" 2.374514e+01 \n",
" 1.201891e+01 \n",
" 7.848392e+00 \n",
" 7.126883e+00 \n",
" 1.052677e+01 \n",
" 8.877742e+00 \n",
" 1.731511e+01 \n",
" 9.253526e+00 \n",
" 5.041069e+00 \n",
" 5.591971e+00 \n",
" 3.942090e+01 \n",
" 2.720284e+01 \n",
" 1.050309e+01 \n",
" 2.252841e+01 \n",
" 4.584549e+00 \n",
" 7.519589e+00 \n",
" 3.517346e+00 \n",
" 3.161220e+01 \n",
" 3.384781e+01 \n",
" 25691.160000 \n",
" 1.000000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class\n",
"count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000\n",
"mean 94813.859575 3.919560e-15 5.688174e-16 -8.769071e-15 2.782312e-15 -1.552563e-15 2.010663e-15 -1.694249e-15 -1.927028e-16 -3.137024e-15 1.768627e-15 9.170318e-16 -1.810658e-15 1.693438e-15 1.479045e-15 3.482336e-15 1.392007e-15 -7.528491e-16 4.328772e-16 9.049732e-16 5.085503e-16 1.537294e-16 7.959909e-16 5.367590e-16 4.458112e-15 1.453003e-15 1.699104e-15 -3.660161e-16 -1.206049e-16 88.349619 0.001727\n",
"std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527\n",
"min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01 -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00 -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000\n",
"25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01 -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01 -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000\n",
"50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03 -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000\n",
"75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000\n",
"max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print the column names, then display per-column summary statistics\n",
"# (count, mean, std, min, quartiles, max) as the cell's output.\n",
"print(data.columns)\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of frauds: 492\n",
"Number of non-frauds: 284315\n",
"Percentage of fradulent data: 0.1727485630620034\n"
]
}
],
"source": [
"# Rows per label value; groupby sorts its keys, so the count for class 0\n",
"# (non-fraud) is unpacked before the count for class 1 (fraud)\n",
"class_sizes = data.groupby('Class').size()\n",
"nonfrauds, frauds = class_sizes\n",
"fraud_percentage = 100.*frauds/(frauds + nonfrauds)\n",
"\n",
"print('Number of frauds: ', frauds)\n",
"print('Number of non-frauds: ', nonfrauds)\n",
"print('Percentage of fradulent data:', fraud_percentage)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Class` column indicates whether or not a transaction is fraudulent. The vast majority of the data is non-fraudulent: only $492$ transactions ($0.172\\%$ of the data, which is also the mean of the `Class` column in the summary above) are fraudulent."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A PCA has already been applied to the features, so let's check their mean and standard deviation."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Draw a 50-bin histogram for each column of the frame on one 20x15 figure\n",
"data.hist(bins=50,figsize=(20,15))\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looks good: as a result of the PCA, the `V` columns are centered at 0 mean, with standard deviations between roughly 0.33 and 1.96 (see the summary table above). Now, let's change the data to be Amazon Fraud Detector compatible."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount', 'class'], dtype='object')\n"
]
}
],
"source": [
"# to lowercase\n",
"# (every header is replaced by its lower-case form, e.g. 'Time' -> 'time')\n",
"data.columns = [column_name.lower() for column_name in data.columns]\n",
"print(data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount', 'class'], dtype='object')\n"
]
}
],
"source": [
"# mapping column names numbers to letters\n",
"\n",
"def standardize_headers(x):\n",
"    \"\"\"Replace the numeric tail of a header such as 'v12' with letters.\n",
"\n",
"    Everything after the first character is read as a number and written\n",
"    in bijective base 26 (1 -> 'a', 26 -> 'z', 27 -> 'aa', 28 -> 'ab', ...),\n",
"    then prefixed with 'v': 'v1' -> 'va', 'v28' -> 'vab'.\n",
"\n",
"    Headers whose tail is not a positive integer (for example 'time' or\n",
"    'amount') are returned unchanged instead of raising ValueError.\n",
"    \"\"\"\n",
"    suffix = x[1:]\n",
"    if not suffix.isdecimal() or int(suffix) < 1:\n",
"        return x\n",
"    index = int(suffix)\n",
"    letters = ''\n",
"    while index > 0:\n",
"        # bijective base 26: shift to 0-25 before splitting off a letter, so\n",
"        # 26 maps to 'z' and 27 rolls over to 'aa'\n",
"        index, remainder = divmod(index - 1, 26)\n",
"        letters = chr(ord('a') + remainder) + letters\n",
"    return 'v' + letters\n",
"\n",
"data.rename(columns=standardize_headers, inplace=True)\n",
"print(data.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then rename the timestamp and label columns to `EVENT_TIMESTAMP` and `EVENT_LABEL`, following the Amazon Fraud Detector naming conventions."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" va \n",
" vb \n",
" vc \n",
" vd \n",
" ve \n",
" vf \n",
" vg \n",
" vh \n",
" vi \n",
" vj \n",
" vk \n",
" vl \n",
" vm \n",
" vn \n",
" vo \n",
" vp \n",
" vq \n",
" vr \n",
" vs \n",
" vt \n",
" vu \n",
" vv \n",
" vw \n",
" vx \n",
" vy \n",
" vz \n",
" vaa \n",
" vab \n",
" amount \n",
" EVENT_LABEL \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.0 \n",
" -1.359807 \n",
" -0.072781 \n",
" 2.536347 \n",
" 1.378155 \n",
" -0.338321 \n",
" 0.462388 \n",
" 0.239599 \n",
" 0.098698 \n",
" 0.363787 \n",
" 0.090794 \n",
" -0.551600 \n",
" -0.617801 \n",
" -0.991390 \n",
" -0.311169 \n",
" 1.468177 \n",
" -0.470401 \n",
" 0.207971 \n",
" 0.025791 \n",
" 0.403993 \n",
" 0.251412 \n",
" -0.018307 \n",
" 0.277838 \n",
" -0.110474 \n",
" 0.066928 \n",
" 0.128539 \n",
" -0.189115 \n",
" 0.133558 \n",
" -0.021053 \n",
" 149.62 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 0.0 \n",
" 1.191857 \n",
" 0.266151 \n",
" 0.166480 \n",
" 0.448154 \n",
" 0.060018 \n",
" -0.082361 \n",
" -0.078803 \n",
" 0.085102 \n",
" -0.255425 \n",
" -0.166974 \n",
" 1.612727 \n",
" 1.065235 \n",
" 0.489095 \n",
" -0.143772 \n",
" 0.635558 \n",
" 0.463917 \n",
" -0.114805 \n",
" -0.183361 \n",
" -0.145783 \n",
" -0.069083 \n",
" -0.225775 \n",
" -0.638672 \n",
" 0.101288 \n",
" -0.339846 \n",
" 0.167170 \n",
" 0.125895 \n",
" -0.008983 \n",
" 0.014724 \n",
" 2.69 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" -1.358354 \n",
" -1.340163 \n",
" 1.773209 \n",
" 0.379780 \n",
" -0.503198 \n",
" 1.800499 \n",
" 0.791461 \n",
" 0.247676 \n",
" -1.514654 \n",
" 0.207643 \n",
" 0.624501 \n",
" 0.066084 \n",
" 0.717293 \n",
" -0.165946 \n",
" 2.345865 \n",
" -2.890083 \n",
" 1.109969 \n",
" -0.121359 \n",
" -2.261857 \n",
" 0.524980 \n",
" 0.247998 \n",
" 0.771679 \n",
" 0.909412 \n",
" -0.689281 \n",
" -0.327642 \n",
" -0.139097 \n",
" -0.055353 \n",
" -0.059752 \n",
" 378.66 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 1.0 \n",
" -0.966272 \n",
" -0.185226 \n",
" 1.792993 \n",
" -0.863291 \n",
" -0.010309 \n",
" 1.247203 \n",
" 0.237609 \n",
" 0.377436 \n",
" -1.387024 \n",
" -0.054952 \n",
" -0.226487 \n",
" 0.178228 \n",
" 0.507757 \n",
" -0.287924 \n",
" -0.631418 \n",
" -1.059647 \n",
" -0.684093 \n",
" 1.965775 \n",
" -1.232622 \n",
" -0.208038 \n",
" -0.108300 \n",
" 0.005274 \n",
" -0.190321 \n",
" -1.175575 \n",
" 0.647376 \n",
" -0.221929 \n",
" 0.062723 \n",
" 0.061458 \n",
" 123.50 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 2.0 \n",
" -1.158233 \n",
" 0.877737 \n",
" 1.548718 \n",
" 0.403034 \n",
" -0.407193 \n",
" 0.095921 \n",
" 0.592941 \n",
" -0.270533 \n",
" 0.817739 \n",
" 0.753074 \n",
" -0.822843 \n",
" 0.538196 \n",
" 1.345852 \n",
" -1.119670 \n",
" 0.175121 \n",
" -0.451449 \n",
" -0.237033 \n",
" -0.038195 \n",
" 0.803487 \n",
" 0.408542 \n",
" -0.009431 \n",
" 0.798278 \n",
" -0.137458 \n",
" 0.141267 \n",
" -0.206010 \n",
" 0.502292 \n",
" 0.219422 \n",
" 0.215153 \n",
" 69.99 \n",
" 0 \n",
" \n",
" \n",
" 5 \n",
" 2.0 \n",
" -0.425966 \n",
" 0.960523 \n",
" 1.141109 \n",
" -0.168252 \n",
" 0.420987 \n",
" -0.029728 \n",
" 0.476201 \n",
" 0.260314 \n",
" -0.568671 \n",
" -0.371407 \n",
" 1.341262 \n",
" 0.359894 \n",
" -0.358091 \n",
" -0.137134 \n",
" 0.517617 \n",
" 0.401726 \n",
" -0.058133 \n",
" 0.068653 \n",
" -0.033194 \n",
" 0.084968 \n",
" -0.208254 \n",
" -0.559825 \n",
" -0.026398 \n",
" -0.371427 \n",
" -0.232794 \n",
" 0.105915 \n",
" 0.253844 \n",
" 0.081080 \n",
" 3.67 \n",
" 0 \n",
" \n",
" \n",
" 6 \n",
" 4.0 \n",
" 1.229658 \n",
" 0.141004 \n",
" 0.045371 \n",
" 1.202613 \n",
" 0.191881 \n",
" 0.272708 \n",
" -0.005159 \n",
" 0.081213 \n",
" 0.464960 \n",
" -0.099254 \n",
" -1.416907 \n",
" -0.153826 \n",
" -0.751063 \n",
" 0.167372 \n",
" 0.050144 \n",
" -0.443587 \n",
" 0.002821 \n",
" -0.611987 \n",
" -0.045575 \n",
" -0.219633 \n",
" -0.167716 \n",
" -0.270710 \n",
" -0.154104 \n",
" -0.780055 \n",
" 0.750137 \n",
" -0.257237 \n",
" 0.034507 \n",
" 0.005168 \n",
" 4.99 \n",
" 0 \n",
" \n",
" \n",
" 7 \n",
" 7.0 \n",
" -0.644269 \n",
" 1.417964 \n",
" 1.074380 \n",
" -0.492199 \n",
" 0.948934 \n",
" 0.428118 \n",
" 1.120631 \n",
" -3.807864 \n",
" 0.615375 \n",
" 1.249376 \n",
" -0.619468 \n",
" 0.291474 \n",
" 1.757964 \n",
" -1.323865 \n",
" 0.686133 \n",
" -0.076127 \n",
" -1.222127 \n",
" -0.358222 \n",
" 0.324505 \n",
" -0.156742 \n",
" 1.943465 \n",
" -1.015455 \n",
" 0.057504 \n",
" -0.649709 \n",
" -0.415267 \n",
" -0.051634 \n",
" -1.206921 \n",
" -1.085339 \n",
" 40.80 \n",
" 0 \n",
" \n",
" \n",
" 8 \n",
" 7.0 \n",
" -0.894286 \n",
" 0.286157 \n",
" -0.113192 \n",
" -0.271526 \n",
" 2.669599 \n",
" 3.721818 \n",
" 0.370145 \n",
" 0.851084 \n",
" -0.392048 \n",
" -0.410430 \n",
" -0.705117 \n",
" -0.110452 \n",
" -0.286254 \n",
" 0.074355 \n",
" -0.328783 \n",
" -0.210077 \n",
" -0.499768 \n",
" 0.118765 \n",
" 0.570328 \n",
" 0.052736 \n",
" -0.073425 \n",
" -0.268092 \n",
" -0.204233 \n",
" 1.011592 \n",
" 0.373205 \n",
" -0.384157 \n",
" 0.011747 \n",
" 0.142404 \n",
" 93.20 \n",
" 0 \n",
" \n",
" \n",
" 9 \n",
" 9.0 \n",
" -0.338262 \n",
" 1.119593 \n",
" 1.044367 \n",
" -0.222187 \n",
" 0.499361 \n",
" -0.246761 \n",
" 0.651583 \n",
" 0.069539 \n",
" -0.736727 \n",
" -0.366846 \n",
" 1.017614 \n",
" 0.836390 \n",
" 1.006844 \n",
" -0.443523 \n",
" 0.150219 \n",
" 0.739453 \n",
" -0.540980 \n",
" 0.476677 \n",
" 0.451773 \n",
" 0.203711 \n",
" -0.246914 \n",
" -0.633753 \n",
" -0.120794 \n",
" -0.385050 \n",
" -0.069733 \n",
" 0.094199 \n",
" 0.246219 \n",
" 0.083076 \n",
" 3.68 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n",
"0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0\n",
"1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0\n",
"2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0\n",
"3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0\n",
"4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0\n",
"5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 3.67 0\n",
"6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 4.99 0\n",
"7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 40.80 0\n",
"8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 93.20 0\n",
"9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 3.68 0"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rename to the Amazon Fraud Detector name conventions \n",
"afd_column_names = {\n",
"    'time': 'EVENT_TIMESTAMP',\n",
"    'class': 'EVENT_LABEL',\n",
"}\n",
"data.rename(columns=afd_column_names, inplace=True)\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The time column is represented as an increasing number of seconds; let's translate that to real dates."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1600788845.180449\n"
]
}
],
"source": [
"# Get epoch time for the initial dataset date\n",
"# Naive datetime for 1970-01-01 00:00:00 UTC: the same value\n",
"# datetime.utcfromtimestamp(0) returns, without using that deprecated call.\n",
"epoch = datetime(1970, 1, 1)\n",
"\n",
"def unix_time_seconds(dt):\n",
"    \"\"\"Return the seconds elapsed between the Unix epoch and `dt`.\n",
"\n",
"    `dt` must be a naive datetime expressed in UTC, because it is\n",
"    subtracted from the naive UTC `epoch` defined in this cell.\n",
"    \"\"\"\n",
"    return (dt - epoch).total_seconds()\n",
"\n",
"# The dataset is anchored at the current time (whole seconds, UTC).\n",
"# start_dt is built from time.gmtime() so it is a naive UTC datetime like\n",
"# `epoch`; datetime.now() returns local time and would skew start_ep by the\n",
"# machine's UTC offset.\n",
"# NOTE(review): an earlier comment spoke of pretending the data is from\n",
"# yesterday, and a fixed 'Sep 22 2020' start was assigned and then immediately\n",
"# overwritten by the current time. Only the effective behavior (current time)\n",
"# is kept; to pin a fixed start use e.g. start_dt = datetime(2020, 9, 22).\n",
"start_dt = datetime(*time.gmtime()[:6])\n",
"start_ep = unix_time_seconds(start_dt)\n",
"print(start_ep)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Datetime parse test"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-09-22 00:00:00\n"
]
}
],
"source": [
"# Parse a month/day/year, 12-hour-clock timestamp string and print the result\n",
"date_str = '9/22/2020 12:00:00 AM'\n",
"date_format = \"%m/%d/%Y %I:%M:%S %p\"\n",
"date = datetime.strptime(date_str, date_format)\n",
"print(date)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Translate the current timestamp format (increasing seconds) to ISO 8601 standard"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" va \n",
" vb \n",
" vc \n",
" vd \n",
" ve \n",
" vf \n",
" vg \n",
" vh \n",
" vi \n",
" vj \n",
" vk \n",
" vl \n",
" vm \n",
" vn \n",
" vo \n",
" vp \n",
" vq \n",
" vr \n",
" vs \n",
" vt \n",
" vu \n",
" vv \n",
" vw \n",
" vx \n",
" vy \n",
" vz \n",
" vaa \n",
" vab \n",
" amount \n",
" EVENT_LABEL \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2020-09-22T15:34:05Z \n",
" -1.359807 \n",
" -0.072781 \n",
" 2.536347 \n",
" 1.378155 \n",
" -0.338321 \n",
" 0.462388 \n",
" 0.239599 \n",
" 0.098698 \n",
" 0.363787 \n",
" 0.090794 \n",
" -0.551600 \n",
" -0.617801 \n",
" -0.991390 \n",
" -0.311169 \n",
" 1.468177 \n",
" -0.470401 \n",
" 0.207971 \n",
" 0.025791 \n",
" 0.403993 \n",
" 0.251412 \n",
" -0.018307 \n",
" 0.277838 \n",
" -0.110474 \n",
" 0.066928 \n",
" 0.128539 \n",
" -0.189115 \n",
" 0.133558 \n",
" -0.021053 \n",
" 149.62 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 2020-09-22T15:34:05Z \n",
" 1.191857 \n",
" 0.266151 \n",
" 0.166480 \n",
" 0.448154 \n",
" 0.060018 \n",
" -0.082361 \n",
" -0.078803 \n",
" 0.085102 \n",
" -0.255425 \n",
" -0.166974 \n",
" 1.612727 \n",
" 1.065235 \n",
" 0.489095 \n",
" -0.143772 \n",
" 0.635558 \n",
" 0.463917 \n",
" -0.114805 \n",
" -0.183361 \n",
" -0.145783 \n",
" -0.069083 \n",
" -0.225775 \n",
" -0.638672 \n",
" 0.101288 \n",
" -0.339846 \n",
" 0.167170 \n",
" 0.125895 \n",
" -0.008983 \n",
" 0.014724 \n",
" 2.69 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 2020-09-22T15:34:06Z \n",
" -1.358354 \n",
" -1.340163 \n",
" 1.773209 \n",
" 0.379780 \n",
" -0.503198 \n",
" 1.800499 \n",
" 0.791461 \n",
" 0.247676 \n",
" -1.514654 \n",
" 0.207643 \n",
" 0.624501 \n",
" 0.066084 \n",
" 0.717293 \n",
" -0.165946 \n",
" 2.345865 \n",
" -2.890083 \n",
" 1.109969 \n",
" -0.121359 \n",
" -2.261857 \n",
" 0.524980 \n",
" 0.247998 \n",
" 0.771679 \n",
" 0.909412 \n",
" -0.689281 \n",
" -0.327642 \n",
" -0.139097 \n",
" -0.055353 \n",
" -0.059752 \n",
" 378.66 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 2020-09-22T15:34:06Z \n",
" -0.966272 \n",
" -0.185226 \n",
" 1.792993 \n",
" -0.863291 \n",
" -0.010309 \n",
" 1.247203 \n",
" 0.237609 \n",
" 0.377436 \n",
" -1.387024 \n",
" -0.054952 \n",
" -0.226487 \n",
" 0.178228 \n",
" 0.507757 \n",
" -0.287924 \n",
" -0.631418 \n",
" -1.059647 \n",
" -0.684093 \n",
" 1.965775 \n",
" -1.232622 \n",
" -0.208038 \n",
" -0.108300 \n",
" 0.005274 \n",
" -0.190321 \n",
" -1.175575 \n",
" 0.647376 \n",
" -0.221929 \n",
" 0.062723 \n",
" 0.061458 \n",
" 123.50 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 2020-09-22T15:34:07Z \n",
" -1.158233 \n",
" 0.877737 \n",
" 1.548718 \n",
" 0.403034 \n",
" -0.407193 \n",
" 0.095921 \n",
" 0.592941 \n",
" -0.270533 \n",
" 0.817739 \n",
" 0.753074 \n",
" -0.822843 \n",
" 0.538196 \n",
" 1.345852 \n",
" -1.119670 \n",
" 0.175121 \n",
" -0.451449 \n",
" -0.237033 \n",
" -0.038195 \n",
" 0.803487 \n",
" 0.408542 \n",
" -0.009431 \n",
" 0.798278 \n",
" -0.137458 \n",
" 0.141267 \n",
" -0.206010 \n",
" 0.502292 \n",
" 0.219422 \n",
" 0.215153 \n",
" 69.99 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n",
"0 2020-09-22T15:34:05Z -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0\n",
"1 2020-09-22T15:34:05Z 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0\n",
"2 2020-09-22T15:34:06Z -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0\n",
"3 2020-09-22T15:34:06Z -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0\n",
"4 2020-09-22T15:34:07Z -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# translate seconds delta to actual datetimes in ISO 8601\n",
"def to_datetime(x, start=None):\n",
"    \"\"\"Format `start + x` epoch seconds as an ISO 8601 UTC timestamp string.\n",
"\n",
"    x: offset in seconds (a value of the dataset's time column).\n",
"    start: epoch seconds the offset is added to; when omitted, the\n",
"        notebook-level `start_ep` is used.\n",
"    Returns a string such as '2020-09-22T15:34:05Z'.\n",
"    \"\"\"\n",
"    if start is None:\n",
"        start = start_ep\n",
"    current_ep = start + x\n",
"    # gmtime rather than localtime: the trailing 'Z' designates UTC, so the\n",
"    # formatted fields must be UTC as well, whatever the machine's timezone\n",
"    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(current_ep))\n",
"\n",
"\n",
"data['EVENT_TIMESTAMP'] = data['EVENT_TIMESTAMP'].apply(to_datetime)\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check for null values."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"EVENT_TIMESTAMP 0\n",
"va 0\n",
"vb 0\n",
"vc 0\n",
"vd 0\n",
"ve 0\n",
"vf 0\n",
"vg 0\n",
"vh 0\n",
"vi 0\n",
"vj 0\n",
"vk 0\n",
"vl 0\n",
"vm 0\n",
"vn 0\n",
"vo 0\n",
"vp 0\n",
"vq 0\n",
"vr 0\n",
"vs 0\n",
"vt 0\n",
"vu 0\n",
"vv 0\n",
"vw 0\n",
"vx 0\n",
"vy 0\n",
"vz 0\n",
"vaa 0\n",
"vab 0\n",
"amount 0\n",
"EVENT_LABEL 0\n",
"dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of missing values in each column (isna is the same check as isnull)\n",
"data.isna().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will split our dataset into train and test sets to evaluate the performance of our models. It's important to do so before any techniques meant to alleviate the class imbalance are used. This ensures that we don't leak information from the test set into the train set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Separate the label column from the remaining columns, both as NumPy arrays;\n",
"# the labels are cast to float32\n",
"label_column = 'EVENT_LABEL'\n",
"features = data.drop(columns=label_column).to_numpy()\n",
"labels = data[label_column].to_numpy().astype('float32')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.\n",
"# NOTE(review): the split is not stratified. With so few fraud rows, passing\n",
"# stratify=labels may be worth considering - confirm before changing results.\n",
"split_options = {'test_size': 0.1, 'random_state': 42}\n",
"X, X_test, y, y_test = train_test_split(features, labels, **split_options)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({0.0: 255880, 1.0: 446})\n"
]
}
],
"source": [
"# Count how many training labels (y) fall in each class and print the tally\n",
"counter = Counter(y)\n",
"print(counter)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Getting the train DataFrame back together"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['EVENT_TIMESTAMP', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount'], dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" va \n",
" vb \n",
" vc \n",
" vd \n",
" ve \n",
" vf \n",
" vg \n",
" vh \n",
" vi \n",
" vj \n",
" vk \n",
" vl \n",
" vm \n",
" vn \n",
" vo \n",
" vp \n",
" vq \n",
" vr \n",
" vs \n",
" vt \n",
" vu \n",
" vv \n",
" vw \n",
" vx \n",
" vy \n",
" vz \n",
" vaa \n",
" vab \n",
" amount \n",
" EVENT_LABEL \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2020-09-22T23:29:20Z \n",
" 1.22664 \n",
" 0.101988 \n",
" -0.0870724 \n",
" 0.111524 \n",
" -0.281992 \n",
" -1.35603 \n",
" 0.46905 \n",
" -0.371725 \n",
" -0.153672 \n",
" -0.145105 \n",
" -0.143505 \n",
" 0.320964 \n",
" 0.149313 \n",
" 0.452515 \n",
" 0.776253 \n",
" 0.0287393 \n",
" -0.177867 \n",
" -1.00475 \n",
" 0.264953 \n",
" 0.0196923 \n",
" -0.3551 \n",
" -1.15366 \n",
" 0.109793 \n",
" 0.420318 \n",
" 0.197932 \n",
" 0.699218 \n",
" -0.114861 \n",
" 0.00758255 \n",
" 50.4 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 2020-09-23T14:39:30Z \n",
" 1.12485 \n",
" 0.125602 \n",
" 0.249962 \n",
" 0.489744 \n",
" -0.0403864 \n",
" 0.167561 \n",
" -0.247614 \n",
" 0.284736 \n",
" -0.0673023 \n",
" -0.170139 \n",
" 1.89512 \n",
" 0.507733 \n",
" -1.00865 \n",
" 0.187249 \n",
" 1.02166 \n",
" 0.13632 \n",
" 0.370186 \n",
" -0.573559 \n",
" -0.625413 \n",
" -0.204465 \n",
" -0.192467 \n",
" -0.576819 \n",
" 0.190343 \n",
" -0.357451 \n",
" 0.000869796 \n",
" 0.139971 \n",
" -0.000993408 \n",
" 0.0115049 \n",
" 1.98 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 2020-09-23T12:33:02Z \n",
" -0.307902 \n",
" 1.00371 \n",
" 1.40428 \n",
" 0.592627 \n",
" 0.311014 \n",
" -0.382106 \n",
" 0.531393 \n",
" -0.0152922 \n",
" -0.758638 \n",
" -0.511597 \n",
" 0.149643 \n",
" 0.245866 \n",
" 0.752802 \n",
" -0.382214 \n",
" 1.59018 \n",
" -0.332546 \n",
" 0.611852 \n",
" -0.510495 \n",
" 0.563779 \n",
" 0.12522 \n",
" -0.131802 \n",
" -0.329268 \n",
" 0.04699 \n",
" 0.0574128 \n",
" -0.65696 \n",
" 0.193192 \n",
" 0.142038 \n",
" 0.157501 \n",
" 1.98 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 2020-09-24T11:00:03Z \n",
" 2.17492 \n",
" -1.53544 \n",
" -0.726428 \n",
" -1.43079 \n",
" -1.51726 \n",
" -0.751038 \n",
" -1.15534 \n",
" -0.180811 \n",
" -1.11188 \n",
" 1.5361 \n",
" -0.735705 \n",
" -0.771714 \n",
" 0.238603 \n",
" -0.44705 \n",
" -0.0858861 \n",
" -0.568345 \n",
" 0.591162 \n",
" -0.104975 \n",
" -0.327292 \n",
" -0.334351 \n",
" -0.112766 \n",
" 0.0500184 \n",
" 0.294666 \n",
" 1.12332 \n",
" -0.306025 \n",
" -0.241343 \n",
" 0.00655296 \n",
" -0.0275668 \n",
" 64 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 2020-09-24T12:42:48Z \n",
" -2.22156 \n",
" 1.26199 \n",
" 2.04764 \n",
" 4.65927 \n",
" -0.535941 \n",
" 4.54204 \n",
" -3.71553 \n",
" -5.3117 \n",
" -0.955321 \n",
" 0.200601 \n",
" -1.34262 \n",
" 0.879905 \n",
" 0.241171 \n",
" -0.36554 \n",
" -1.73541 \n",
" 0.564495 \n",
" 0.380648 \n",
" 1.21669 \n",
" 1.87271 \n",
" 0.89599 \n",
" -1.82039 \n",
" 0.873723 \n",
" -2.6486 \n",
" -0.16218 \n",
" -0.492111 \n",
" 0.60149 \n",
" 0.62703 \n",
" 0.088289 \n",
" 379.29 \n",
" 0 \n",
" \n",
" \n",
" 5 \n",
" 2020-09-23T23:13:57Z \n",
" -0.335198 \n",
" 0.871378 \n",
" 0.632703 \n",
" 4.16424 \n",
" 1.70258 \n",
" 1.95435 \n",
" 0.396722 \n",
" 0.495056 \n",
" -2.50602 \n",
" 1.60914 \n",
" 1.02925 \n",
" -0.143569 \n",
" -0.129187 \n",
" 0.862428 \n",
" 1.05951 \n",
" -0.862144 \n",
" 0.597711 \n",
" -0.509967 \n",
" 0.217672 \n",
" 0.260404 \n",
" 0.529635 \n",
" 1.47356 \n",
" 0.0334131 \n",
" -1.33327 \n",
" -0.779961 \n",
" 0.595196 \n",
" 0.231547 \n",
" 0.193332 \n",
" 57.78 \n",
" 0 \n",
" \n",
" \n",
" 6 \n",
" 2020-09-22T20:44:58Z \n",
" -1.30598 \n",
" 1.77212 \n",
" 0.74173 \n",
" 0.912351 \n",
" 0.498898 \n",
" 1.73749 \n",
" -0.957795 \n",
" -1.6923 \n",
" 0.755233 \n",
" -0.641461 \n",
" 0.900131 \n",
" -0.935381 \n",
" 3.06823 \n",
" 1.56476 \n",
" -1.35744 \n",
" -0.216156 \n",
" 0.712878 \n",
" 0.354701 \n",
" 0.95993 \n",
" -0.579386 \n",
" 1.98545 \n",
" -1.29011 \n",
" 0.108807 \n",
" -1.42794 \n",
" 0.140905 \n",
" -0.393444 \n",
" 0.078297 \n",
" -0.0525052 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" 7 \n",
" 2020-09-24T04:38:10Z \n",
" -0.303356 \n",
" 1.145 \n",
" -0.843639 \n",
" -1.04661 \n",
" 0.945826 \n",
" -1.8227 \n",
" 1.69451 \n",
" -0.465604 \n",
" -0.0954349 \n",
" -0.150239 \n",
" -1.00453 \n",
" -0.509581 \n",
" -1.0396 \n",
" 0.805069 \n",
" -0.0770889 \n",
" -0.649795 \n",
" -0.363229 \n",
" -0.307477 \n",
" -0.163633 \n",
" 0.00746885 \n",
" 0.208148 \n",
" 0.84297 \n",
" -0.20149 \n",
" 0.0761925 \n",
" -0.277275 \n",
" 0.0933392 \n",
" 0.497711 \n",
" 0.33648 \n",
" 17 \n",
" 0 \n",
" \n",
" \n",
" 8 \n",
" 2020-09-23T15:48:17Z \n",
" -0.46592 \n",
" 0.628365 \n",
" 1.44957 \n",
" 4.41735 \n",
" 1.03465 \n",
" 1.08391 \n",
" 0.103565 \n",
" -0.624355 \n",
" -0.22619 \n",
" 2.9792 \n",
" -1.31971 \n",
" -1.13684 \n",
" -0.027942 \n",
" -0.972251 \n",
" 0.758803 \n",
" -0.104912 \n",
" -0.462734 \n",
" 0.521972 \n",
" 1.0497 \n",
" -0.0554636 \n",
" 0.192156 \n",
" 1.23173 \n",
" -0.113115 \n",
" 0.603061 \n",
" -1.89004 \n",
" 0.0698536 \n",
" -0.907822 \n",
" -0.13317 \n",
" 11.31 \n",
" 0 \n",
" \n",
" \n",
" 9 \n",
" 2020-09-23T02:50:52Z \n",
" 0.271095 \n",
" -2.72049 \n",
" 0.427427 \n",
" -0.080256 \n",
" -2.07731 \n",
" 0.334682 \n",
" -0.620412 \n",
" 0.174167 \n",
" 0.0874801 \n",
" 0.347286 \n",
" 0.319218 \n",
" -0.528176 \n",
" -1.56046 \n",
" -0.28146 \n",
" -1.10355 \n",
" 1.18135 \n",
" 0.433565 \n",
" -0.623177 \n",
" 1.16548 \n",
" 1.06845 \n",
" 0.341998 \n",
" -0.188797 \n",
" -0.504412 \n",
" 0.0272953 \n",
" 0.140939 \n",
" -0.294037 \n",
" -0.0638468 \n",
" 0.102788 \n",
" 552.89 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n",
"0 2020-09-22T23:29:20Z 1.22664 0.101988 -0.0870724 0.111524 -0.281992 -1.35603 0.46905 -0.371725 -0.153672 -0.145105 -0.143505 0.320964 0.149313 0.452515 0.776253 0.0287393 -0.177867 -1.00475 0.264953 0.0196923 -0.3551 -1.15366 0.109793 0.420318 0.197932 0.699218 -0.114861 0.00758255 50.4 0\n",
"1 2020-09-23T14:39:30Z 1.12485 0.125602 0.249962 0.489744 -0.0403864 0.167561 -0.247614 0.284736 -0.0673023 -0.170139 1.89512 0.507733 -1.00865 0.187249 1.02166 0.13632 0.370186 -0.573559 -0.625413 -0.204465 -0.192467 -0.576819 0.190343 -0.357451 0.000869796 0.139971 -0.000993408 0.0115049 1.98 0\n",
"2 2020-09-23T12:33:02Z -0.307902 1.00371 1.40428 0.592627 0.311014 -0.382106 0.531393 -0.0152922 -0.758638 -0.511597 0.149643 0.245866 0.752802 -0.382214 1.59018 -0.332546 0.611852 -0.510495 0.563779 0.12522 -0.131802 -0.329268 0.04699 0.0574128 -0.65696 0.193192 0.142038 0.157501 1.98 0\n",
"3 2020-09-24T11:00:03Z 2.17492 -1.53544 -0.726428 -1.43079 -1.51726 -0.751038 -1.15534 -0.180811 -1.11188 1.5361 -0.735705 -0.771714 0.238603 -0.44705 -0.0858861 -0.568345 0.591162 -0.104975 -0.327292 -0.334351 -0.112766 0.0500184 0.294666 1.12332 -0.306025 -0.241343 0.00655296 -0.0275668 64 0\n",
"4 2020-09-24T12:42:48Z -2.22156 1.26199 2.04764 4.65927 -0.535941 4.54204 -3.71553 -5.3117 -0.955321 0.200601 -1.34262 0.879905 0.241171 -0.36554 -1.73541 0.564495 0.380648 1.21669 1.87271 0.89599 -1.82039 0.873723 -2.6486 -0.16218 -0.492111 0.60149 0.62703 0.088289 379.29 0\n",
"5 2020-09-23T23:13:57Z -0.335198 0.871378 0.632703 4.16424 1.70258 1.95435 0.396722 0.495056 -2.50602 1.60914 1.02925 -0.143569 -0.129187 0.862428 1.05951 -0.862144 0.597711 -0.509967 0.217672 0.260404 0.529635 1.47356 0.0334131 -1.33327 -0.779961 0.595196 0.231547 0.193332 57.78 0\n",
"6 2020-09-22T20:44:58Z -1.30598 1.77212 0.74173 0.912351 0.498898 1.73749 -0.957795 -1.6923 0.755233 -0.641461 0.900131 -0.935381 3.06823 1.56476 -1.35744 -0.216156 0.712878 0.354701 0.95993 -0.579386 1.98545 -1.29011 0.108807 -1.42794 0.140905 -0.393444 0.078297 -0.0525052 1 0\n",
"7 2020-09-24T04:38:10Z -0.303356 1.145 -0.843639 -1.04661 0.945826 -1.8227 1.69451 -0.465604 -0.0954349 -0.150239 -1.00453 -0.509581 -1.0396 0.805069 -0.0770889 -0.649795 -0.363229 -0.307477 -0.163633 0.00746885 0.208148 0.84297 -0.20149 0.0761925 -0.277275 0.0933392 0.497711 0.33648 17 0\n",
"8 2020-09-23T15:48:17Z -0.46592 0.628365 1.44957 4.41735 1.03465 1.08391 0.103565 -0.624355 -0.22619 2.9792 -1.31971 -1.13684 -0.027942 -0.972251 0.758803 -0.104912 -0.462734 0.521972 1.0497 -0.0554636 0.192156 1.23173 -0.113115 0.603061 -1.89004 0.0698536 -0.907822 -0.13317 11.31 0\n",
"9 2020-09-23T02:50:52Z 0.271095 -2.72049 0.427427 -0.080256 -2.07731 0.334682 -0.620412 0.174167 0.0874801 0.347286 0.319218 -0.528176 -1.56046 -0.28146 -1.10355 1.18135 0.433565 -0.623177 1.16548 1.06845 0.341998 -0.188797 -0.504412 0.0272953 0.140939 -0.294037 -0.0638468 0.102788 552.89 0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"saved_cols = data.drop('EVENT_LABEL', axis=1).columns\n",
"print(saved_cols)\n",
"data = pd.DataFrame(X, columns = saved_cols)\n",
"data['EVENT_LABEL']=y.astype(int)\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" va \n",
" vb \n",
" vc \n",
" vd \n",
" ve \n",
" vf \n",
" vg \n",
" vh \n",
" vi \n",
" vj \n",
" vk \n",
" vl \n",
" vm \n",
" vn \n",
" vo \n",
" vp \n",
" vq \n",
" vr \n",
" vs \n",
" vt \n",
" vu \n",
" vv \n",
" vw \n",
" vx \n",
" vy \n",
" vz \n",
" vaa \n",
" vab \n",
" amount \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2020-09-23T03:05:50Z \n",
" -16.5265 \n",
" 8.58497 \n",
" -18.6499 \n",
" 9.50559 \n",
" -13.7938 \n",
" -2.8324 \n",
" -16.7017 \n",
" 7.51734 \n",
" -8.50706 \n",
" -14.1102 \n",
" 5.29924 \n",
" -10.834 \n",
" 1.67112 \n",
" -9.37386 \n",
" 0.360806 \n",
" -9.89925 \n",
" -19.2363 \n",
" -8.39855 \n",
" 3.10174 \n",
" -1.51492 \n",
" 1.19074 \n",
" -1.12767 \n",
" -2.35858 \n",
" 0.673461 \n",
" -1.4137 \n",
" -0.462762 \n",
" -2.01858 \n",
" -1.0428 \n",
" 364.19 \n",
" \n",
" \n",
" 1 \n",
" 2020-09-23T03:51:46Z \n",
" 0.339812 \n",
" -2.74375 \n",
" -0.13407 \n",
" -1.38573 \n",
" -1.45141 \n",
" 1.01589 \n",
" -0.524379 \n",
" 0.22406 \n",
" 0.899746 \n",
" -0.565012 \n",
" -0.0876703 \n",
" 0.979427 \n",
" 0.0768828 \n",
" -0.217884 \n",
" -0.13683 \n",
" -2.14289 \n",
" 0.126956 \n",
" 1.75266 \n",
" 0.432546 \n",
" 0.506044 \n",
" -0.213436 \n",
" -0.942525 \n",
" -0.526819 \n",
" -1.15699 \n",
" 0.311211 \n",
" -0.746647 \n",
" 0.0409958 \n",
" 0.102038 \n",
" 520.12 \n",
" \n",
" \n",
" 2 \n",
" 2020-09-23T01:25:29Z \n",
" 1.39959 \n",
" -0.590701 \n",
" 0.168619 \n",
" -1.02995 \n",
" -0.539806 \n",
" 0.0404441 \n",
" -0.712567 \n",
" 0.00229859 \n",
" -0.971747 \n",
" 0.756801 \n",
" 0.543827 \n",
" 0.112453 \n",
" 1.07538 \n",
" -0.245772 \n",
" 0.180483 \n",
" 1.76986 \n",
" -0.533172 \n",
" -0.5333 \n",
" 1.19225 \n",
" 0.212877 \n",
" 0.102398 \n",
" 0.168269 \n",
" -0.166639 \n",
" -0.81025 \n",
" 0.505083 \n",
" -0.23234 \n",
" 0.0114086 \n",
" 0.00463414 \n",
" 31 \n",
" \n",
" \n",
" 3 \n",
" 2020-09-24T13:59:28Z \n",
" -0.432071 \n",
" 1.6479 \n",
" -1.66936 \n",
" -0.349504 \n",
" 0.785785 \n",
" -0.630647 \n",
" 0.27699 \n",
" 0.586025 \n",
" -0.484715 \n",
" -1.37665 \n",
" -1.32834 \n",
" 0.223621 \n",
" 1.13263 \n",
" -0.550875 \n",
" 0.616568 \n",
" 0.497974 \n",
" 0.502195 \n",
" 0.981343 \n",
" 0.101264 \n",
" -0.244633 \n",
" 0.358932 \n",
" 0.873663 \n",
" -0.178642 \n",
" -0.0171708 \n",
" -0.207392 \n",
" -0.157756 \n",
" -0.237386 \n",
" 0.00193412 \n",
" 1.5 \n",
" \n",
" \n",
" 4 \n",
" 2020-09-24T14:21:58Z \n",
" 2.01416 \n",
" -0.137394 \n",
" -1.01584 \n",
" 0.327269 \n",
" -0.182179 \n",
" -0.956571 \n",
" 0.0432408 \n",
" -0.160746 \n",
" 0.363241 \n",
" 0.259452 \n",
" 0.942162 \n",
" 0.850038 \n",
" -0.616166 \n",
" 0.592634 \n",
" -0.603845 \n",
" 0.0910772 \n",
" -0.471867 \n",
" -0.333816 \n",
" 0.404711 \n",
" -0.255293 \n",
" -0.238644 \n",
" -0.6164 \n",
" 0.347045 \n",
" 0.0615612 \n",
" -0.360196 \n",
" 0.17473 \n",
" -0.0780435 \n",
" -0.0705705 \n",
" 0.89 \n",
" \n",
" \n",
" 5 \n",
" 2020-09-23T10:25:23Z \n",
" -0.64133 \n",
" -0.0573039 \n",
" 1.49 \n",
" -1.68813 \n",
" -1.15104 \n",
" 0.259996 \n",
" -1.39107 \n",
" -2.33408 \n",
" 1.16864 \n",
" -2.08408 \n",
" 0.480381 \n",
" 0.473738 \n",
" -2.19228 \n",
" 0.773942 \n",
" 0.294484 \n",
" 0.406074 \n",
" -0.541855 \n",
" 1.03145 \n",
" 0.0170758 \n",
" 0.618411 \n",
" -1.23163 \n",
" 0.257164 \n",
" -0.371953 \n",
" -0.0385661 \n",
" 1.39751 \n",
" -0.665947 \n",
" 0.031003 \n",
" 0.180357 \n",
" 100 \n",
" \n",
" \n",
" 6 \n",
" 2020-09-24T11:56:48Z \n",
" 2.02395 \n",
" -0.12014 \n",
" -1.08692 \n",
" 0.423019 \n",
" -0.142901 \n",
" -1.12775 \n",
" 0.178493 \n",
" -0.303234 \n",
" 0.564509 \n",
" 0.0628307 \n",
" -0.720047 \n",
" 0.366835 \n",
" -0.110857 \n",
" 0.319094 \n",
" 0.108359 \n",
" -0.153633 \n",
" -0.221312 \n",
" -0.934141 \n",
" 0.0705527 \n",
" -0.210864 \n",
" -0.276175 \n",
" -0.697708 \n",
" 0.335631 \n",
" -0.0171964 \n",
" -0.324904 \n",
" 0.200023 \n",
" -0.071566 \n",
" -0.0582239 \n",
" 16.99 \n",
" \n",
" \n",
" 7 \n",
" 2020-09-24T06:21:16Z \n",
" -0.688944 \n",
" 1.29215 \n",
" -0.564281 \n",
" -1.45753 \n",
" 2.25833 \n",
" -0.32327 \n",
" 1.67898 \n",
" -0.104128 \n",
" -1.28535 \n",
" -1.30343 \n",
" 0.282728 \n",
" -0.402525 \n",
" -0.548687 \n",
" -0.504283 \n",
" -0.685339 \n",
" 0.714828 \n",
" -0.0926736 \n",
" 0.798953 \n",
" -0.150085 \n",
" -0.0371504 \n",
" -0.00687953 \n",
" -0.171568 \n",
" -0.720019 \n",
" -0.419435 \n",
" 1.21199 \n",
" 0.670916 \n",
" -0.103986 \n",
" 0.0300842 \n",
" 8.95 \n",
" \n",
" \n",
" 8 \n",
" 2020-09-24T04:46:29Z \n",
" 2.11936 \n",
" 0.142639 \n",
" -2.37334 \n",
" 0.541949 \n",
" 0.608419 \n",
" -1.77556 \n",
" 0.955775 \n",
" -0.599383 \n",
" 0.0104198 \n",
" 0.295305 \n",
" -0.936569 \n",
" -0.452478 \n",
" -1.3408 \n",
" 1.07746 \n",
" -0.0995836 \n",
" -0.815072 \n",
" 0.0184811 \n",
" -0.639446 \n",
" -0.0654267 \n",
" -0.323573 \n",
" 0.264264 \n",
" 0.898266 \n",
" -0.168063 \n",
" 0.0593112 \n",
" 0.626949 \n",
" 0.729035 \n",
" -0.12912 \n",
" -0.0947133 \n",
" 10 \n",
" \n",
" \n",
" 9 \n",
" 2020-09-23T07:53:34Z \n",
" -5.58426 \n",
" -4.73241 \n",
" -0.448452 \n",
" -0.121442 \n",
" -0.707412 \n",
" -0.114376 \n",
" -1.55463 \n",
" 1.40213 \n",
" -0.0316932 \n",
" -0.942358 \n",
" -2.4395 \n",
" -0.552312 \n",
" -0.295588 \n",
" -0.250246 \n",
" -1.19773 \n",
" 1.54955 \n",
" 0.933237 \n",
" -1.23769 \n",
" 0.416832 \n",
" -1.0469 \n",
" 0.0416507 \n",
" 0.621789 \n",
" 0.223467 \n",
" -0.770137 \n",
" 0.621182 \n",
" -0.0287379 \n",
" 0.505194 \n",
" -1.89832 \n",
" 101.49 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount\n",
"0 2020-09-23T03:05:50Z -16.5265 8.58497 -18.6499 9.50559 -13.7938 -2.8324 -16.7017 7.51734 -8.50706 -14.1102 5.29924 -10.834 1.67112 -9.37386 0.360806 -9.89925 -19.2363 -8.39855 3.10174 -1.51492 1.19074 -1.12767 -2.35858 0.673461 -1.4137 -0.462762 -2.01858 -1.0428 364.19\n",
"1 2020-09-23T03:51:46Z 0.339812 -2.74375 -0.13407 -1.38573 -1.45141 1.01589 -0.524379 0.22406 0.899746 -0.565012 -0.0876703 0.979427 0.0768828 -0.217884 -0.13683 -2.14289 0.126956 1.75266 0.432546 0.506044 -0.213436 -0.942525 -0.526819 -1.15699 0.311211 -0.746647 0.0409958 0.102038 520.12\n",
"2 2020-09-23T01:25:29Z 1.39959 -0.590701 0.168619 -1.02995 -0.539806 0.0404441 -0.712567 0.00229859 -0.971747 0.756801 0.543827 0.112453 1.07538 -0.245772 0.180483 1.76986 -0.533172 -0.5333 1.19225 0.212877 0.102398 0.168269 -0.166639 -0.81025 0.505083 -0.23234 0.0114086 0.00463414 31\n",
"3 2020-09-24T13:59:28Z -0.432071 1.6479 -1.66936 -0.349504 0.785785 -0.630647 0.27699 0.586025 -0.484715 -1.37665 -1.32834 0.223621 1.13263 -0.550875 0.616568 0.497974 0.502195 0.981343 0.101264 -0.244633 0.358932 0.873663 -0.178642 -0.0171708 -0.207392 -0.157756 -0.237386 0.00193412 1.5\n",
"4 2020-09-24T14:21:58Z 2.01416 -0.137394 -1.01584 0.327269 -0.182179 -0.956571 0.0432408 -0.160746 0.363241 0.259452 0.942162 0.850038 -0.616166 0.592634 -0.603845 0.0910772 -0.471867 -0.333816 0.404711 -0.255293 -0.238644 -0.6164 0.347045 0.0615612 -0.360196 0.17473 -0.0780435 -0.0705705 0.89\n",
"5 2020-09-23T10:25:23Z -0.64133 -0.0573039 1.49 -1.68813 -1.15104 0.259996 -1.39107 -2.33408 1.16864 -2.08408 0.480381 0.473738 -2.19228 0.773942 0.294484 0.406074 -0.541855 1.03145 0.0170758 0.618411 -1.23163 0.257164 -0.371953 -0.0385661 1.39751 -0.665947 0.031003 0.180357 100\n",
"6 2020-09-24T11:56:48Z 2.02395 -0.12014 -1.08692 0.423019 -0.142901 -1.12775 0.178493 -0.303234 0.564509 0.0628307 -0.720047 0.366835 -0.110857 0.319094 0.108359 -0.153633 -0.221312 -0.934141 0.0705527 -0.210864 -0.276175 -0.697708 0.335631 -0.0171964 -0.324904 0.200023 -0.071566 -0.0582239 16.99\n",
"7 2020-09-24T06:21:16Z -0.688944 1.29215 -0.564281 -1.45753 2.25833 -0.32327 1.67898 -0.104128 -1.28535 -1.30343 0.282728 -0.402525 -0.548687 -0.504283 -0.685339 0.714828 -0.0926736 0.798953 -0.150085 -0.0371504 -0.00687953 -0.171568 -0.720019 -0.419435 1.21199 0.670916 -0.103986 0.0300842 8.95\n",
"8 2020-09-24T04:46:29Z 2.11936 0.142639 -2.37334 0.541949 0.608419 -1.77556 0.955775 -0.599383 0.0104198 0.295305 -0.936569 -0.452478 -1.3408 1.07746 -0.0995836 -0.815072 0.0184811 -0.639446 -0.0654267 -0.323573 0.264264 0.898266 -0.168063 0.0593112 0.626949 0.729035 -0.12912 -0.0947133 10\n",
"9 2020-09-23T07:53:34Z -5.58426 -4.73241 -0.448452 -0.121442 -0.707412 -0.114376 -1.55463 1.40213 -0.0316932 -0.942358 -2.4395 -0.552312 -0.295588 -0.250246 -1.19773 1.54955 0.933237 -1.23769 0.416832 -1.0469 0.0416507 0.621789 0.223467 -0.770137 0.621182 -0.0287379 0.505194 -1.89832 101.49"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test = pd.DataFrame(X_test, columns = saved_cols)\n",
"test.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" va \n",
" vb \n",
" vc \n",
" vd \n",
" ve \n",
" vf \n",
" vg \n",
" vh \n",
" vi \n",
" vj \n",
" vk \n",
" vl \n",
" vm \n",
" vn \n",
" vo \n",
" vp \n",
" vq \n",
" vr \n",
" vs \n",
" vt \n",
" vu \n",
" vv \n",
" vw \n",
" vx \n",
" vy \n",
" vz \n",
" vaa \n",
" vab \n",
" amount \n",
" EVENT_LABEL \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2020-09-23T03:05:50Z \n",
" -16.5265 \n",
" 8.58497 \n",
" -18.6499 \n",
" 9.50559 \n",
" -13.7938 \n",
" -2.8324 \n",
" -16.7017 \n",
" 7.51734 \n",
" -8.50706 \n",
" -14.1102 \n",
" 5.29924 \n",
" -10.834 \n",
" 1.67112 \n",
" -9.37386 \n",
" 0.360806 \n",
" -9.89925 \n",
" -19.2363 \n",
" -8.39855 \n",
" 3.10174 \n",
" -1.51492 \n",
" 1.19074 \n",
" -1.12767 \n",
" -2.35858 \n",
" 0.673461 \n",
" -1.4137 \n",
" -0.462762 \n",
" -2.01858 \n",
" -1.0428 \n",
" 364.19 \n",
" 1 \n",
" \n",
" \n",
" 1 \n",
" 2020-09-23T03:51:46Z \n",
" 0.339812 \n",
" -2.74375 \n",
" -0.13407 \n",
" -1.38573 \n",
" -1.45141 \n",
" 1.01589 \n",
" -0.524379 \n",
" 0.22406 \n",
" 0.899746 \n",
" -0.565012 \n",
" -0.0876703 \n",
" 0.979427 \n",
" 0.0768828 \n",
" -0.217884 \n",
" -0.13683 \n",
" -2.14289 \n",
" 0.126956 \n",
" 1.75266 \n",
" 0.432546 \n",
" 0.506044 \n",
" -0.213436 \n",
" -0.942525 \n",
" -0.526819 \n",
" -1.15699 \n",
" 0.311211 \n",
" -0.746647 \n",
" 0.0409958 \n",
" 0.102038 \n",
" 520.12 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 2020-09-23T01:25:29Z \n",
" 1.39959 \n",
" -0.590701 \n",
" 0.168619 \n",
" -1.02995 \n",
" -0.539806 \n",
" 0.0404441 \n",
" -0.712567 \n",
" 0.00229859 \n",
" -0.971747 \n",
" 0.756801 \n",
" 0.543827 \n",
" 0.112453 \n",
" 1.07538 \n",
" -0.245772 \n",
" 0.180483 \n",
" 1.76986 \n",
" -0.533172 \n",
" -0.5333 \n",
" 1.19225 \n",
" 0.212877 \n",
" 0.102398 \n",
" 0.168269 \n",
" -0.166639 \n",
" -0.81025 \n",
" 0.505083 \n",
" -0.23234 \n",
" 0.0114086 \n",
" 0.00463414 \n",
" 31 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 2020-09-24T13:59:28Z \n",
" -0.432071 \n",
" 1.6479 \n",
" -1.66936 \n",
" -0.349504 \n",
" 0.785785 \n",
" -0.630647 \n",
" 0.27699 \n",
" 0.586025 \n",
" -0.484715 \n",
" -1.37665 \n",
" -1.32834 \n",
" 0.223621 \n",
" 1.13263 \n",
" -0.550875 \n",
" 0.616568 \n",
" 0.497974 \n",
" 0.502195 \n",
" 0.981343 \n",
" 0.101264 \n",
" -0.244633 \n",
" 0.358932 \n",
" 0.873663 \n",
" -0.178642 \n",
" -0.0171708 \n",
" -0.207392 \n",
" -0.157756 \n",
" -0.237386 \n",
" 0.00193412 \n",
" 1.5 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 2020-09-24T14:21:58Z \n",
" 2.01416 \n",
" -0.137394 \n",
" -1.01584 \n",
" 0.327269 \n",
" -0.182179 \n",
" -0.956571 \n",
" 0.0432408 \n",
" -0.160746 \n",
" 0.363241 \n",
" 0.259452 \n",
" 0.942162 \n",
" 0.850038 \n",
" -0.616166 \n",
" 0.592634 \n",
" -0.603845 \n",
" 0.0910772 \n",
" -0.471867 \n",
" -0.333816 \n",
" 0.404711 \n",
" -0.255293 \n",
" -0.238644 \n",
" -0.6164 \n",
" 0.347045 \n",
" 0.0615612 \n",
" -0.360196 \n",
" 0.17473 \n",
" -0.0780435 \n",
" -0.0705705 \n",
" 0.89 \n",
" 0 \n",
" \n",
" \n",
" 5 \n",
" 2020-09-23T10:25:23Z \n",
" -0.64133 \n",
" -0.0573039 \n",
" 1.49 \n",
" -1.68813 \n",
" -1.15104 \n",
" 0.259996 \n",
" -1.39107 \n",
" -2.33408 \n",
" 1.16864 \n",
" -2.08408 \n",
" 0.480381 \n",
" 0.473738 \n",
" -2.19228 \n",
" 0.773942 \n",
" 0.294484 \n",
" 0.406074 \n",
" -0.541855 \n",
" 1.03145 \n",
" 0.0170758 \n",
" 0.618411 \n",
" -1.23163 \n",
" 0.257164 \n",
" -0.371953 \n",
" -0.0385661 \n",
" 1.39751 \n",
" -0.665947 \n",
" 0.031003 \n",
" 0.180357 \n",
" 100 \n",
" 0 \n",
" \n",
" \n",
" 6 \n",
" 2020-09-24T11:56:48Z \n",
" 2.02395 \n",
" -0.12014 \n",
" -1.08692 \n",
" 0.423019 \n",
" -0.142901 \n",
" -1.12775 \n",
" 0.178493 \n",
" -0.303234 \n",
" 0.564509 \n",
" 0.0628307 \n",
" -0.720047 \n",
" 0.366835 \n",
" -0.110857 \n",
" 0.319094 \n",
" 0.108359 \n",
" -0.153633 \n",
" -0.221312 \n",
" -0.934141 \n",
" 0.0705527 \n",
" -0.210864 \n",
" -0.276175 \n",
" -0.697708 \n",
" 0.335631 \n",
" -0.0171964 \n",
" -0.324904 \n",
" 0.200023 \n",
" -0.071566 \n",
" -0.0582239 \n",
" 16.99 \n",
" 0 \n",
" \n",
" \n",
" 7 \n",
" 2020-09-24T06:21:16Z \n",
" -0.688944 \n",
" 1.29215 \n",
" -0.564281 \n",
" -1.45753 \n",
" 2.25833 \n",
" -0.32327 \n",
" 1.67898 \n",
" -0.104128 \n",
" -1.28535 \n",
" -1.30343 \n",
" 0.282728 \n",
" -0.402525 \n",
" -0.548687 \n",
" -0.504283 \n",
" -0.685339 \n",
" 0.714828 \n",
" -0.0926736 \n",
" 0.798953 \n",
" -0.150085 \n",
" -0.0371504 \n",
" -0.00687953 \n",
" -0.171568 \n",
" -0.720019 \n",
" -0.419435 \n",
" 1.21199 \n",
" 0.670916 \n",
" -0.103986 \n",
" 0.0300842 \n",
" 8.95 \n",
" 0 \n",
" \n",
" \n",
" 8 \n",
" 2020-09-24T04:46:29Z \n",
" 2.11936 \n",
" 0.142639 \n",
" -2.37334 \n",
" 0.541949 \n",
" 0.608419 \n",
" -1.77556 \n",
" 0.955775 \n",
" -0.599383 \n",
" 0.0104198 \n",
" 0.295305 \n",
" -0.936569 \n",
" -0.452478 \n",
" -1.3408 \n",
" 1.07746 \n",
" -0.0995836 \n",
" -0.815072 \n",
" 0.0184811 \n",
" -0.639446 \n",
" -0.0654267 \n",
" -0.323573 \n",
" 0.264264 \n",
" 0.898266 \n",
" -0.168063 \n",
" 0.0593112 \n",
" 0.626949 \n",
" 0.729035 \n",
" -0.12912 \n",
" -0.0947133 \n",
" 10 \n",
" 0 \n",
" \n",
" \n",
" 9 \n",
" 2020-09-23T07:53:34Z \n",
" -5.58426 \n",
" -4.73241 \n",
" -0.448452 \n",
" -0.121442 \n",
" -0.707412 \n",
" -0.114376 \n",
" -1.55463 \n",
" 1.40213 \n",
" -0.0316932 \n",
" -0.942358 \n",
" -2.4395 \n",
" -0.552312 \n",
" -0.295588 \n",
" -0.250246 \n",
" -1.19773 \n",
" 1.54955 \n",
" 0.933237 \n",
" -1.23769 \n",
" 0.416832 \n",
" -1.0469 \n",
" 0.0416507 \n",
" 0.621789 \n",
" 0.223467 \n",
" -0.770137 \n",
" 0.621182 \n",
" -0.0287379 \n",
" 0.505194 \n",
" -1.89832 \n",
" 101.49 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n",
"0 2020-09-23T03:05:50Z -16.5265 8.58497 -18.6499 9.50559 -13.7938 -2.8324 -16.7017 7.51734 -8.50706 -14.1102 5.29924 -10.834 1.67112 -9.37386 0.360806 -9.89925 -19.2363 -8.39855 3.10174 -1.51492 1.19074 -1.12767 -2.35858 0.673461 -1.4137 -0.462762 -2.01858 -1.0428 364.19 1\n",
"1 2020-09-23T03:51:46Z 0.339812 -2.74375 -0.13407 -1.38573 -1.45141 1.01589 -0.524379 0.22406 0.899746 -0.565012 -0.0876703 0.979427 0.0768828 -0.217884 -0.13683 -2.14289 0.126956 1.75266 0.432546 0.506044 -0.213436 -0.942525 -0.526819 -1.15699 0.311211 -0.746647 0.0409958 0.102038 520.12 0\n",
"2 2020-09-23T01:25:29Z 1.39959 -0.590701 0.168619 -1.02995 -0.539806 0.0404441 -0.712567 0.00229859 -0.971747 0.756801 0.543827 0.112453 1.07538 -0.245772 0.180483 1.76986 -0.533172 -0.5333 1.19225 0.212877 0.102398 0.168269 -0.166639 -0.81025 0.505083 -0.23234 0.0114086 0.00463414 31 0\n",
"3 2020-09-24T13:59:28Z -0.432071 1.6479 -1.66936 -0.349504 0.785785 -0.630647 0.27699 0.586025 -0.484715 -1.37665 -1.32834 0.223621 1.13263 -0.550875 0.616568 0.497974 0.502195 0.981343 0.101264 -0.244633 0.358932 0.873663 -0.178642 -0.0171708 -0.207392 -0.157756 -0.237386 0.00193412 1.5 0\n",
"4 2020-09-24T14:21:58Z 2.01416 -0.137394 -1.01584 0.327269 -0.182179 -0.956571 0.0432408 -0.160746 0.363241 0.259452 0.942162 0.850038 -0.616166 0.592634 -0.603845 0.0910772 -0.471867 -0.333816 0.404711 -0.255293 -0.238644 -0.6164 0.347045 0.0615612 -0.360196 0.17473 -0.0780435 -0.0705705 0.89 0\n",
"5 2020-09-23T10:25:23Z -0.64133 -0.0573039 1.49 -1.68813 -1.15104 0.259996 -1.39107 -2.33408 1.16864 -2.08408 0.480381 0.473738 -2.19228 0.773942 0.294484 0.406074 -0.541855 1.03145 0.0170758 0.618411 -1.23163 0.257164 -0.371953 -0.0385661 1.39751 -0.665947 0.031003 0.180357 100 0\n",
"6 2020-09-24T11:56:48Z 2.02395 -0.12014 -1.08692 0.423019 -0.142901 -1.12775 0.178493 -0.303234 0.564509 0.0628307 -0.720047 0.366835 -0.110857 0.319094 0.108359 -0.153633 -0.221312 -0.934141 0.0705527 -0.210864 -0.276175 -0.697708 0.335631 -0.0171964 -0.324904 0.200023 -0.071566 -0.0582239 16.99 0\n",
"7 2020-09-24T06:21:16Z -0.688944 1.29215 -0.564281 -1.45753 2.25833 -0.32327 1.67898 -0.104128 -1.28535 -1.30343 0.282728 -0.402525 -0.548687 -0.504283 -0.685339 0.714828 -0.0926736 0.798953 -0.150085 -0.0371504 -0.00687953 -0.171568 -0.720019 -0.419435 1.21199 0.670916 -0.103986 0.0300842 8.95 0\n",
"8 2020-09-24T04:46:29Z 2.11936 0.142639 -2.37334 0.541949 0.608419 -1.77556 0.955775 -0.599383 0.0104198 0.295305 -0.936569 -0.452478 -1.3408 1.07746 -0.0995836 -0.815072 0.0184811 -0.639446 -0.0654267 -0.323573 0.264264 0.898266 -0.168063 0.0593112 0.626949 0.729035 -0.12912 -0.0947133 10 0\n",
"9 2020-09-23T07:53:34Z -5.58426 -4.73241 -0.448452 -0.121442 -0.707412 -0.114376 -1.55463 1.40213 -0.0316932 -0.942358 -2.4395 -0.552312 -0.295588 -0.250246 -1.19773 1.54955 0.933237 -1.23769 0.416832 -1.0469 0.0416507 0.621789 0.223467 -0.770137 0.621182 -0.0287379 0.505194 -1.89832 101.49 0"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The testing dataset with the labels to perform evaluations latter on\n",
"test_label = pd.DataFrame(X_test, columns = saved_cols)\n",
"test_label['EVENT_LABEL']=y_test.astype(int)\n",
"test_label.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of frauds in test data: 46\n",
"Number of non-frauds in test data: 28435\n",
"Percentage of fradulent data: 0.16151118289385907\n"
]
}
],
"source": [
"#validating the test dataset with labels\n",
"nonfrauds, frauds = test_label.groupby('EVENT_LABEL').size()\n",
"print('Number of frauds in test data: ', frauds)\n",
"print('Number of non-frauds in test data: ', nonfrauds)\n",
"print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of frauds: 446\n",
"Number of non-frauds: 255880\n",
"Percentage of fradulent data: 0.17399717547186005\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEFCAYAAAAWrxseAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAUZUlEQVR4nO3df9ClZX3f8fen/FKLCshCcIEs1c1UtHXFLZAxzdgwAwttZ3FGUkgLW4bOphZaHTONxJhCUTrSadRhQuiQsmWxKlKEsjbgZoumxpZfC/JDQgjPEIXNIiwuENAggt/+ca6nHg7nen7tcp6Ffb9m7jnnfO/ruu7rPCzP55zrvs9zUlVIkjTO31jsCUiSdl2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISE1SZYkeSDJ6xZ7Lq8lSfZJ8mdJDlrsuWj+DAlNVJJfS7I5ybNJHk1yY5JfmsBxK8nbZ2l2LvBfq+q51uePkzzX5jq9fTXJ0iQvJHnbmONcl+Q/DR3zhyP9f7PtO7/tP2Wo756ttqz9XKb7/CTJ80OP//MMz/P9SbbM8rOYPvbRI/V/nuTFoeM8lORDQ/uXtX7Pjmz/pO2/IsmnRo9XVT8G1gEfm2le2jXtudgT0O4jyUcZ/CL+l8BG4HlgFbAa+NYiTo0k+wBrgBUju86pqv8ypv1NwOnA+UO1A4CTgJVDTd9dVVOdw24HLkhybVW9OLyjqk4cGvcKYEtVfWLOT6gjSdq8tzN4vreNNLm5qn6ptT0K+N9Jbqmqbw+12a+qXpjnob8I3JXk4y009CrhOwlNRJI3AxcAZ1fVtVX1w6r6SVV9tar+bWuzT5LPJdnats+1X97Tr3K/NTLm/3930F7FXpLkD5M8k+TW6Vf6Sb7Zutw9/Mp3xDHAU1U146vwIesZ/LIddipwX1XdO8cxvsYgKP/ZHNvvDH8feCvwYeDUJHv3GlbVncD9wDt29KDt5/okcOyOjqXJMiQ0Kb8IvA64boY2v83gl8gK4N3A0cB8Xj2fBvx7YH9gCrgQoKp+ue1/d1XtW1VfHtP37wAPzONY1wEHjiyVnQ5cOY8xCvgd4Lwke82j345YA3wVmP4Z/KNewyR/D/gFYPNOOvb9DP676lXEkNCkvAV4YpZlin8KXFBVj1fVNga/8Edfrc/k2qq6rR3jC7x86Wgm+wHPjKlfnOSpoe2TAFX118B/B84ASLIceC+DZZVhd470P2F4Z1VtALYB/2Iec12QJG8ATgG+WFU/Aa5hEBrDjm3zfJbBUtTngQdH2jwx8pzm+k7jGQY/Z72KGBKalB8weOU903mwtwLfG3r8vVabq+8P3f8RsO88+j4JvHFM/d9U1X5D2+8M7VsP/Gq7Gup04GtV9fhI/6NG+m8cc4xPMHgX9UpfVfUB4AXghvb4C8CJSZYMtbmlzXNf4OeAdwL/YWScA0ee0/1zPP4bgad2YP5aBIaEJuVm4Dng5BnabAV+fujx4a0G8EPgDdM7kvzcTp7fPQyWVuasqv6EQfitZnBeYT5LTcPjbGKwPPavFtJ/HtYwCM6Hk3yfwTuhvRgs042b12PAV4B/vJOO/w7g7p00libEkNBEVNXTwL8DLklycpI3JNkryYlJ/mNr9iXgE+3zCge29v+t7bsbeGeSFe2V+/nznMJjwN+aYf9twH5Jls5z3CuBixgso3x1nn2H/TbwmzvQ/yWSvG5kWwocx+AcxAp+dt7nIl6+5DQ9xlsYvPu4bx6H3mPkuHu3sZYCBwC3LPxZaTEYEpqYqvoM8FEGyyvbgEeAc4D/0Zp8isFJ0nuAe4E7W42q+nMGV0f9LwZr5PO9ZPZ8YH1bQ//VMXN7HriCl19p9Hsjnwm4Y2T/lQze8Xy5c2nn3SP9PzduclX1f3j55agLtRT465HtTOCuqvqjqvr+9AZcDPzdJO9qfX9xeq4MTjRvA/71yPhPjTynjw7tO3
fkuF9v9V8D1nv566tP/GY6aaCtzf8J8J52Ylo7QbuM+W7gl8ecs9EuzpCQJHW53CS9yiT5+Jg/jfFskhsXe2567fGdhCSpy3cSkqSu19wf+DvwwANr2bJliz0NSXpVueOOO56oqiWj9ddcSCxbtozNm3fWn5qRpN1Dku+Nq7vcJEnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXa+7DdK8Wy879w8WewmvKdz/9Dxd7CtJr0qzvJJIcluQbSe5Pcl+SD7f6+Un+MsldbTtpqM9vJZlK8sDwF78nWdVqU0nOHaofkeTWJA8m+fLQt1nt0x5Ptf3LduaTlyTNbC7LTS8Av1FV7wCOBc5OcmTb99mqWtG2GwDavlMZfIH6KuD3k+yRZA/gEuBE4EjgtKFxLmpjLWfwhfRntfpZwJNV9Xbgs62dJGlCZg2Jqnq0qu5s959h8JWGM30P8Grgqqr6cVX9BYMveD+6bVNV9VD7qsirgNVJAvwKcE3rvx44eWis9e3+NcBxrb0kaQLmdeK6Lfe8B7i1lc5Jck+SdUn2b7WlDL67eNqWVuvV3wI8VVUvjNRfMlbb/3RrL0magDmHRJJ9ga8AH6mqvwIuBd4GrAAeBX53uumY7rWA+kxjjc5tbZLNSTZv27ZtxuchSZq7OYVEkr0YBMQXqupagKp6rKperKqfAn/AYDkJBu8EDhvqfiiwdYb6E8B+SfYcqb9krLb/zcD20flV1WVVtbKqVi5Z8rI/hy5JWqC5XN0U4HLg/qr6zFD9kKFmHwC+0+5vAE5tVyYdASwHbgNuB5a3K5n2ZnBye0MNvj/1G8AHW/81wPVDY61p9z8IfL38vlVJmpi5fE7ifcDpwL1J7mq1jzO4OmkFg+Wf7wK/DlBV9yW5GvhTBldGnV1VLwIkOQfYCOwBrKuq+9p4HwOuSvIp4NsMQol2+/kkUwzeQZy6A89VkjRPs4ZEVX2L8ecGbpihz4XAhWPqN4zrV1UP8bPlquH6c8Aps81RkvTK8M9ySJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktQ1a0gkOSzJN5Lcn+S+JB9u9QOSbEryYLvdv9WT5OIkU0nuSXLU0FhrWvsHk6wZqr83yb2tz8VJMtMxJEmTMZd3Ei8Av1FV7wCOBc5OciRwLnBTVS0HbmqPAU4ElrdtLXApDH7hA+cBxwBHA+cN/dK/tLWd7req1XvHkCRNwKwhUVWPVtWd7f4zwP3AUmA1sL41Ww+c3O6vBq6sgVuA/ZIcApwAbKqq7VX1JLAJWNX2vamqbq6qAq4cGWvcMSRJEzCvcxJJlgHvAW4FDq6qR2EQJMBBrdlS4JGhbltabab6ljF1ZjiGJGkC5hwSSfYFvgJ8pKr+aqamY2q1gPqcJVmbZHOSzdu2bZtPV0nSDOYUEkn2YhAQX6iqa1v5sbZURLt9vNW3AIcNdT8U2DpL/dAx9ZmO8RJVdVlVrayqlUuWLJnLU5IkzcFcrm4KcDlwf1V9ZmjXBmD6CqU1wPVD9TPaVU7HAk+3paKNwPFJ9m8nrI8HNrZ9zyQ5th3rjJGxxh1DkjQBe86hzfuA04F7k9zVah8HPg1cneQs4GHglLbvBuAkYAr4EXAmQFVtT/JJ4PbW7oKq2t7ufwi4Ang9cGPbmOEYkqQJmDUkqupbjD9vAHDcmPYFnN0Zax2wbkx9M/CuMfUfjDuGJGky/MS1JKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQp
LUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqmjUkkqxL8niS7wzVzk/yl0nuattJQ/t+K8lUkgeSnDBUX9VqU0nOHaofkeTWJA8m+XKSvVt9n/Z4qu1ftrOetCRpbubyTuIKYNWY+merakXbbgBIciRwKvDO1uf3k+yRZA/gEuBE4EjgtNYW4KI21nLgSeCsVj8LeLKq3g58trWTJE3QrCFRVd8Ets9xvNXAVVX146r6C2AKOLptU1X1UFU9D1wFrE4S4FeAa1r/9cDJQ2Otb/evAY5r7SVJE7Ij5yTOSXJPW47av9WWAo8MtdnSar36W4CnquqFkfpLxmr7n27tJUkTstCQuBR4G7ACeBT43VYf90q/FlCfaayXSbI2yeYkm7dt2zbTvCVJ87CgkKiqx6rqxar6KfAHDJaTYPBO4LChpocCW2eoPwHsl2TPkfpLxmr730xn2auqLquqlVW1csmSJQt5SpKkMRYUEkkOGXr4AWD6yqcNwKntyqQjgOXAbcDtwPJ2JdPeDE5ub6iqAr4BfLD1XwNcPzTWmnb/g8DXW3tJ0oTsOVuDJF8C3g8cmGQLcB7w/iQrGCz/fBf4dYCqui/J1cCfAi8AZ1fVi22cc4CNwB7Auqq6rx3iY8BVST4FfBu4vNUvBz6fZIrBO4hTd/jZSpLmZdaQqKrTxpQvH1Obbn8hcOGY+g3ADWPqD/Gz5arh+nPAKbPNT5L0yvET15KkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1zRoSSdYleTzJd4ZqByTZlOTBdrt/qyfJxUmmktyT5KihPmta+weTrBmqvzfJva3PxUky0zEkSZMzl3cSVwCrRmrnAjdV1XLgpvYY4ERgedvWApfC4Bc+cB5wDHA0cN7QL/1LW9vpfqtmOYYkaUJmDYmq+iawfaS8Gljf7q8HTh6qX1kDtwD7JTkEOAHYVFXbq+pJYBOwqu17U1XdXFUFXDky1rhjSJImZKHnJA6uqkcB2u1Brb4UeGSo3ZZWm6m+ZUx9pmO8TJK1STYn2bxt27YFPiVJ0qidfeI6Y2q1gPq8VNVlVbWyqlYuWbJkvt0lSR0LDYnH2lIR7fbxVt8CHDbU7lBg6yz1Q8fUZzqGJGlCFhoSG4DpK5TWANcP1c9oVzkdCzzdloo2Ascn2b+dsD4e2Nj2PZPk2HZV0xkjY407hiRpQvacrUGSLwHvBw5MsoXBVUqfBq5OchbwMHBKa34DcBIwBfwIOBOgqrYn+SRwe2t3QVVNnwz/EIMrqF4P3Ng2ZjiGJGlCZg2Jqjqts+u4MW0LOLszzjpg3Zj6ZuBdY+o/GHcMSdLk+IlrSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUtUMhkeS7Se5NcleSza12QJJNSR5st/u3epJcnGQqyT1JjhoaZ01r/2CSNUP197bxp1rf7Mh8JUnzszPeSfyDqlpRVSvb43OBm6pqOXBTewxwIrC8bWuBS2EQKsB5wDHA0cB508HS2qwd6rdqJ8xXkjRHr8Ry02pgfbu/Hjh5qH5lDdwC7JfkEOAEYFNVba+qJ4FNwKq2701VdXNVFXDl0FiSpAnY0ZAo4I+S3JFkbasdXFWPArTbg1p9KfDIUN8trTZTfcuYuiRpQvbcwf7vq6qtSQ
4CNiX5sxnajjufUAuov3zgQUCtBTj88MNnnrEkac526J1EVW1tt48D1zE4p/BYWyqi3T7emm8BDhvqfiiwdZb6oWPq4+ZxWVWtrKqVS5Ys2ZGnJEkasuCQSPI3k7xx+j5wPPAdYAMwfYXSGuD6dn8DcEa7yulY4Om2HLUROD7J/u2E9fHAxrbvmSTHtquazhgaS5I0ATuy3HQwcF27KnVP4ItV9bUktwNXJzkLeBg4pbW/ATgJmAJ+BJwJUFXbk3wSuL21u6Cqtrf7HwKuAF4P3Ng2SdKELDgkquoh4N1j6j8AjhtTL+DszljrgHVj6puBdy10jpKkHeMnriVJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnq2uVDIsmqJA8kmUpy7mLPR5J2J7t0SCTZA7gEOBE4EjgtyZGLOytJ2n3s0iEBHA1MVdVDVfU8cBWwepHnJEm7jT0XewKzWAo8MvR4C3DMaKMka4G17eGzSR6YwNx2FwcCTyz2JGaTixZ7BloEr4p/m68iPz+uuKuHRMbU6mWFqsuAy1756ex+kmyuqpWLPQ9plP82J2NXX27aAhw29PhQYOsizUWSdju7ekjcDixPckSSvYFTgQ2LPCdJ2m3s0stNVfVCknOAjcAewLqqum+Rp7W7cRlPuyr/bU5Aql62xC9JErDrLzdJkhaRISFJ6jIkJEldu/SJa01Wkr/N4BPtSxl8HmUrsKGq7l/UiUlaNL6TEABJPsbgz54EuI3B5ccBvuQfVtSuLMmZiz2H1zKvbhIASf4ceGdV/WSkvjdwX1UtX5yZSTNL8nBVHb7Y83itcrlJ034KvBX43kj9kLZPWjRJ7untAg6e5Fx2N4aEpn0EuCnJg/zsjyoeDrwdOGfRZiUNHAycADw5Ug/wfyc/nd2HISEAquprSX6BwZ9nX8rgf74twO1V9eKiTk6C/wnsW1V3je5I8seTn87uw3MSkqQur26SJHUZEpKkLkNCktRlSEiSugwJSVLX/wPySywSeEQacQAAAABJRU5ErkJggg==\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#validating the training dataset\n",
"nonfrauds, frauds = data.groupby('EVENT_LABEL').size()\n",
"print('Number of frauds: ', frauds)\n",
"print('Number of non-frauds: ', nonfrauds)\n",
"print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))\n",
"\n",
"count_class_0, count_class_1 = data.EVENT_LABEL.value_counts()\n",
"data.EVENT_LABEL.value_counts().plot(kind='bar', title='Count (EVENT_LABEL)');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Uploading the data for training"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ResponseMetadata': {'RequestId': 'CE5955583F8DEF3F',\n",
" 'HostId': '9KFf1TqYSU63r33be9Rr6yi3KLpN8eZUEl1bPIaSHbT25EZXtnOG/0nHiFO7lgSAMBXq/qHd6zE=',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'x-amz-id-2': '9KFf1TqYSU63r33be9Rr6yi3KLpN8eZUEl1bPIaSHbT25EZXtnOG/0nHiFO7lgSAMBXq/qHd6zE=',\n",
" 'x-amz-request-id': 'CE5955583F8DEF3F',\n",
" 'date': 'Tue, 22 Sep 2020 15:49:02 GMT',\n",
" 'x-amz-server-side-encryption': 'AES256',\n",
" 'etag': '\"d28f6b034afe0855dc3140edf895eb29\"',\n",
" 'content-length': '0',\n",
" 'server': 'AmazonS3'},\n",
" 'RetryAttempts': 0},\n",
" 'ETag': '\"d28f6b034afe0855dc3140edf895eb29\"',\n",
" 'ServerSideEncryption': 'AES256'}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csv_buffer = StringIO()\n",
"data.to_csv(csv_buffer, index=False)\n",
"s3_resource.Object(S3_BUCKET, 'dataset-training.csv').put(Body=csv_buffer.getvalue())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Uploading the data for testing"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ResponseMetadata': {'RequestId': '4YAJCW9VAM8XFR4Y',\n",
" 'HostId': 'ytH2B37HSHmc1uJeMhHErSg02Zv+MOZa5YrJwrkAZQkTsL9qMOhgWQ3Wwhuufgd58WdWdTSVLto=',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'x-amz-id-2': 'ytH2B37HSHmc1uJeMhHErSg02Zv+MOZa5YrJwrkAZQkTsL9qMOhgWQ3Wwhuufgd58WdWdTSVLto=',\n",
" 'x-amz-request-id': '4YAJCW9VAM8XFR4Y',\n",
" 'date': 'Tue, 22 Sep 2020 15:49:08 GMT',\n",
" 'x-amz-server-side-encryption': 'AES256',\n",
" 'etag': '\"d097d1d2e99495c26975acad965d81a5\"',\n",
" 'content-length': '0',\n",
" 'server': 'AmazonS3'},\n",
" 'RetryAttempts': 0},\n",
" 'ETag': '\"d097d1d2e99495c26975acad965d81a5\"',\n",
" 'ServerSideEncryption': 'AES256'}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csv_buffer = StringIO()\n",
"test.to_csv(csv_buffer, index=False)\n",
"\n",
"s3_resource.Object(S3_BUCKET, 'dataset-test.csv').put(Body=csv_buffer.getvalue())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the datasets are ready, we need to create the necessary entities to build and deploy the fraud detection model. This can be done visually within the Amazon Fraud Detector console or through the API, as shown in the following section."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# -- This is all you need to fill out. Once complete simply interactively run each code cell. -- \n",
"# your_entity_name\n",
"ENTITY_TYPE = \"creditcardtrans{0}\".format(sufx) \n",
"ENTITY_DESC = \"creditcard transactions: {0}\".format(sufx) \n",
"# your_event_type\n",
"EVENT_TYPE = \"creditcard{0}\".format(sufx) \n",
"EVENT_DESC = \"creditcard card payment events: {0}\".format(sufx) \n",
"# your_model_name\n",
"MODEL_NAME = \"fraud_detector_model{0}\".format(sufx) \n",
"MODEL_DESC = \"model trained on: {0}\".format(sufx) \n",
"# your_detector_name\n",
"DETECTOR_NAME = \"fraud_detector_endpoint{0}\".format(sufx) \n",
"DETECTOR_DESC = \"detects synthetic fraud events created: {0}\".format(sufx) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Profile Your Dataset \n",
"-----\n",
"\n",
" \n",
" 💡 Profiling \n",
"\n",
"The function below will: 1. profile your data, creating descriptive statistics, 2. perform basic data quality checks (nulls, unique variables, etc.), and 3. return summary statistics and the EVENT and MODEL schemas used to define your EVENT_TYPE and TRAIN your MODEL.\n",
"\n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- summary stats ---\n",
" feature_name dtype count nunique null not_null null_pct nunique_pct feature_type feature_warning\n",
"0 EVENT_TIMESTAMP object 256326 119735 0 256326 0.0 0.4671 EVENT_TIMESTAMP NO WARNING\n",
"1 va object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"2 vb object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"3 vc object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"4 vd object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"5 ve object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"6 vf object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"7 vg object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"8 vh object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"9 vi object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"10 vj object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"11 vk object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"12 vl object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"13 vm object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"14 vn object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"15 vo object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"16 vp object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"17 vq object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"18 vr object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"19 vs object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"20 vt object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"21 vu object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"22 vv object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"23 vw object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"24 vx object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"25 vy object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"26 vz object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"27 vaa object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"28 vab object 256326 248553 0 256326 0.0 0.9697 CATEGORY NO WARNING\n",
"29 amount object 256326 31154 0 256326 0.0 0.1215 PRICE NO WARNING\n",
"30 EVENT_LABEL int64 256326 2 0 256326 0.0 0.0000 TARGET NO WARNING\n",
"\n",
"\n",
"--- event variables ---\n",
"['va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount']\n",
"\n",
"\n",
"--- event labels ---\n",
"[0, 1]\n",
"\n",
"\n",
"--- training data schema ---\n",
"{'modelVariables': ['va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab'], 'labelSchema': {'labelMapper': {'FRAUD': ['1'], 'LEGIT': ['0']}}}\n",
"\n",
"\n"
]
}
],
"source": [
"# --- no changes; just run this code block ---\n",
"def summary_stats(df):\n",
"    \"\"\"Profile a pandas DataFrame and derive the Fraud Detector schemas from it.\n",
"\n",
"    The summary statistics, event variables, event labels and training data\n",
"    schema are printed and then returned.\n",
"\n",
"    Args:\n",
"        df (DataFrame): panda's dataframe to create summary statistics for;\n",
"            it must contain an EVENT_LABEL column.\n",
"    Returns:\n",
"        DataFrame of summary statistics, training data schema, event variables and event labels\n",
"    \"\"\"\n",
"    reserved_fields = ['EVENT_LABEL', 'EVENT_TIMESTAMP']\n",
"    model_feature_types = ['IP_ADDRESS', 'EMAIL_ADDRESS', 'CATEGORY', 'NUMERIC']\n",
"\n",
"    df = df.copy()\n",
"    rowcnt = len(df)\n",
"    df_s1 = df.agg(['count', 'nunique']).transpose().reset_index().rename(columns={\"index\":\"feature_name\"})\n",
"    df_s1[\"null\"] = (rowcnt - df_s1[\"count\"]).astype('int64')\n",
"    df_s1[\"not_null\"] = rowcnt - df_s1[\"null\"]\n",
"    df_s1[\"null_pct\"] = df_s1[\"null\"] / rowcnt\n",
"    df_s1[\"nunique_pct\"] = df_s1['nunique'] / rowcnt\n",
"    dt = pd.DataFrame(df.dtypes).reset_index().rename(columns={\"index\":\"feature_name\", 0:\"dtype\"})\n",
"    df_stats = pd.merge(dt, df_s1, on='feature_name', how='inner').round(4)\n",
"    df_stats['nunique'] = df_stats['nunique'].astype('int64')\n",
"    df_stats['count'] = df_stats['count'].astype('int64')\n",
"\n",
"    # -- variable type mapper: dtype-based defaults first, then name-based overrides --\n",
"    df_stats['feature_type'] = \"UNKOWN\"\n",
"    df_stats.loc[df_stats[\"dtype\"] == object, 'feature_type'] = \"CATEGORY\"\n",
"    df_stats.loc[(df_stats[\"dtype\"] == \"int64\") | (df_stats[\"dtype\"] == \"float64\"), 'feature_type'] = \"NUMERIC\"\n",
"    df_stats.loc[df_stats[\"feature_name\"].str.contains(\"ipaddress|ip_address|ipcli\"), 'feature_type'] = \"IP_ADDRESS\"\n",
"    df_stats.loc[df_stats[\"feature_name\"].str.contains(\"email|email_address|emailaddr\"), 'feature_type'] = \"EMAIL_ADDRESS\"\n",
"    df_stats.loc[df_stats[\"feature_name\"].str.contains(\"canal|channel\"), 'feature_type'] = \"USERAGENT\"\n",
"    df_stats.loc[df_stats[\"feature_name\"].str.contains(\"monto|amount\"), 'feature_type'] = \"PRICE\"\n",
"    df_stats.loc[df_stats[\"feature_name\"].str.contains(\"nomdes|name\"), 'feature_type'] = \"BILLING_NAME\"\n",
"    df_stats.loc[df_stats[\"feature_name\"] == \"EVENT_LABEL\", 'feature_type'] = \"TARGET\"\n",
"    df_stats.loc[df_stats[\"feature_name\"] == \"EVENT_TIMESTAMP\", 'feature_type'] = \"EVENT_TIMESTAMP\"\n",
"\n",
"    # -- variable warnings (a later assignment overrides an earlier one) --\n",
"    df_stats['feature_warning'] = \"NO WARNING\"\n",
"    df_stats.loc[(df_stats[\"nunique\"] != 2) & (df_stats[\"feature_name\"] == \"EVENT_LABEL\"),'feature_warning' ] = \"LABEL WARNING, NON-BINARY EVENT LABEL\"\n",
"    df_stats.loc[(df_stats[\"nunique_pct\"] > 0.97) & (df_stats['feature_type'] == \"CATEGORY\") ,'feature_warning' ] = \"EXCLUDE, GT 97% UNIQUE\"\n",
"    df_stats.loc[(df_stats[\"null_pct\"] > 0.2) & (df_stats[\"null_pct\"] <= 0.5), 'feature_warning' ] = \"NULL WARNING, GT 20% MISSING\"\n",
"    df_stats.loc[df_stats[\"null_pct\"] > 0.5,'feature_warning' ] = \"EXCLUDE, GT 50% MISSING\"\n",
"    # Low cardinality is judged on the unique *ratio*: the raw `nunique` count is an\n",
"    # integer, so comparing it with 0.2 could only ever match columns without values.\n",
"    # The label and timestamp columns are skipped so the label check above is kept.\n",
"    is_numeric = (df_stats['dtype'] == \"int64\") | (df_stats['dtype'] == \"float64\")\n",
"    is_reserved = df_stats['feature_name'].isin(reserved_fields)\n",
"    df_stats.loc[is_numeric & ~is_reserved & (df_stats['nunique_pct'] < 0.2), 'feature_warning' ] = \"LIKELY CATEGORICAL, NUMERIC w. LOW CARDINALITY\"\n",
"\n",
"    # -- event variables and labels --\n",
"    event_variables = df_stats.loc[~df_stats['feature_name'].isin(reserved_fields), 'feature_name'].to_list()\n",
"    event_labels = df[\"EVENT_LABEL\"].unique().tolist()\n",
"\n",
"    # -- training schema: the rarest label is taken as fraud, the most common as legit --\n",
"    label_counts = df[\"EVENT_LABEL\"].value_counts()\n",
"    trainingDataSchema = {\n",
"        'modelVariables' : df_stats.loc[df_stats['feature_type'].isin(model_feature_types), 'feature_name'].to_list(),\n",
"        'labelSchema' : {\n",
"            'labelMapper' : {\n",
"                'FRAUD' : [str(label_counts.idxmin())],\n",
"                'LEGIT' : [str(label_counts.idxmax())]\n",
"            }\n",
"        }\n",
"    }\n",
"\n",
"    print(\"--- summary stats ---\")\n",
"    print(df_stats)\n",
"    print(\"\\n\")\n",
"    print(\"--- event variables ---\")\n",
"    print(event_variables)\n",
"    print(\"\\n\")\n",
"    print(\"--- event labels ---\")\n",
"    print(event_labels)\n",
"    print(\"\\n\")\n",
"    print(\"--- training data schema ---\")\n",
"    print(trainingDataSchema)\n",
"    print(\"\\n\")\n",
"\n",
"    return df_stats, trainingDataSchema, event_variables, event_labels\n",
"\n",
"# -- connect to S3, snag file, and convert to a panda's dataframe --\n",
"#s3 = boto3.resource('s3')\n",
"#obj = s3.Object(S3_BUCKET, S3_FILE)\n",
"#body = obj.get()['Body']\n",
"#df = pd.read_csv(body)\n",
"\n",
"# -- call profiling function -- \n",
"df_stats, trainingDataSchema, eventVariables, eventLabels = summary_stats(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Create Variables\n",
"-----\n",
"\n",
" 💡 Create Variables. \n",
"\n",
"The following section will automatically create your modeling input variables and your model scoring variable for you. \n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- no changes just run this code block ---\n",
"def create_label(df, FRAUD_LABEL):\n",
"    \"\"\"\n",
"    Registers the two event labels with Fraud Detector and returns the model labelSchema.\n",
"    The least frequent value of the label column is treated as fraud, the most frequent as legit.\n",
"\n",
"    Arguments:\n",
"        df -- input dataframe\n",
"        FRAUD_LABEL -- the name of the field that contains fraud label\n",
"\n",
"    Returns:\n",
"        labelSchema -- a dictionary containing labelKey & labelMapper\n",
"    \"\"\"\n",
"    counts = df[FRAUD_LABEL].value_counts()\n",
"    label_by_role = {\"FRAUD\": str(counts.idxmin()), \"LEGIT\": str(counts.idxmax())}\n",
"\n",
"    # the label is stored under its raw value; the role goes into the description\n",
"    for role, label_name in label_by_role.items():\n",
"        afd_resource.put_label(name = label_name, description = role)\n",
"\n",
"    return {'labelKey': FRAUD_LABEL,\n",
"            \"labelMapper\": {role: [label_name] for role, label_name in label_by_role.items()}}\n",
" \n",
"# -- function to create all your variables --- \n",
"def _ensure_variable(name, dataType, defaultValue, variableType, dataSource='EVENT'):\n",
"    \"\"\"Create the Fraud Detector variable `name` unless the service already has it.\"\"\"\n",
"    try:\n",
"        afd_resource.get_variables(name=name)\n",
"    except afd_resource.exceptions.ResourceNotFoundException:\n",
"        # Only a missing variable triggers creation; access, throttling or\n",
"        # validation errors now surface instead of being mistaken for it.\n",
"        print(\"Creating variable: {0}\".format(name))\n",
"        afd_resource.create_variable(\n",
"            name = name,\n",
"            dataType = dataType,\n",
"            dataSource = dataSource,\n",
"            defaultValue = defaultValue,\n",
"            description = name,\n",
"            variableType = variableType )\n",
"\n",
"\n",
"def create_variables(df_stats, MODEL_NAME):\n",
"    \"\"\"\n",
"    Registers the profiled features as Fraud Detector variables, creating only the ones\n",
"    that do not exist yet, plus the '{MODEL_NAME}_insightscore' model score variable.\n",
"\n",
"    Arguments:\n",
"        df_stats -- summary dataframe with 'feature_name' and 'feature_type' columns\n",
"        MODEL_NAME -- model id used to name the model score variable\n",
"\n",
"    Returns:\n",
"        variable_list -- a list of {'name': ...} dictionaries, one per input variable\n",
"                         (the model score variable is not included)\n",
"    \"\"\"\n",
"    enrichment_types = ['IP_ADDRESS', 'EMAIL_ADDRESS', 'USERAGENT', 'BILLING_NAME', 'PRICE']\n",
"    variable_list = []\n",
"\n",
"    # -- first do the enrichment features: PRICE is numeric, the others are strings\n",
"    enrichment = df_stats.loc[df_stats['feature_type'].isin(enrichment_types)]\n",
"    for name, feature_type in zip(enrichment['feature_name'], enrichment['feature_type']):\n",
"        variable_list.append({'name' : name})\n",
"        if feature_type == \"PRICE\":\n",
"            _ensure_variable(name, 'FLOAT', '0', feature_type)\n",
"        else:\n",
"            _ensure_variable(name, 'STRING', '', feature_type)\n",
"\n",
"    # -- check and update the numeric features\n",
"    for name in df_stats.loc[df_stats['feature_type'] == 'NUMERIC', 'feature_name']:\n",
"        variable_list.append({'name' : name})\n",
"        _ensure_variable(name, 'FLOAT', '0.0', 'NUMERIC')\n",
"\n",
"    # -- check and update the categorical features\n",
"    for name in df_stats.loc[df_stats['feature_type'] == 'CATEGORY', 'feature_name']:\n",
"        variable_list.append({'name' : name})\n",
"        _ensure_variable(name, 'STRING', '', 'CATEGORICAL')\n",
"\n",
"    # -- create a model score feature\n",
"    model_feature = \"{0}_insightscore\".format(MODEL_NAME)\n",
"    _ensure_variable(model_feature, 'FLOAT', '0.0', 'NUMERIC', dataSource='MODEL_SCORE')\n",
"\n",
"    return variable_list\n",
"\n",
"\n",
"model_variables = create_variables(df_stats, MODEL_NAME)\n",
"print(\"\\n --- model variable dict --\")\n",
"print(model_variables)\n",
"\n",
"\n",
"model_label = create_label(data, \"EVENT_LABEL\")\n",
"print(\"\\n --- model label schema dict --\")\n",
"print(model_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Create Entity and Event Types\n",
"-----\n",
" \n",
" 💡 Entity and Event. \n",
" \n",
"The following code block will automatically create your entity and event types for you.\n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Amazon Fraud Detector expect the labels to be strings.\n",
"eventLabels = list(map(str, eventLabels))\n",
"print(eventLabels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- no changes just run this code block ---\n",
"response = afd_resource.put_entity_type(\n",
" name = ENTITY_TYPE,\n",
" description = ENTITY_DESC\n",
")\n",
"print(\"-- create entity --\")\n",
"print(response)\n",
"\n",
"response = afd_resource.put_event_type (\n",
" name = EVENT_TYPE,\n",
" eventVariables = eventVariables,\n",
" labels = eventLabels,\n",
" entityTypes = [ENTITY_TYPE])\n",
"print(\"-- create event type --\")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4. Create & Train your Model\n",
"-----\n",
" \n",
" 💡 Train Model. \n",
"\n",
"The following section will automatically train and activate your model for you. \n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- create our model --\n",
"response = afd_resource.create_model(\n",
" description = MODEL_DESC,\n",
" eventTypeName = EVENT_TYPE,\n",
" modelId = MODEL_NAME,\n",
" modelType = 'ONLINE_FRAUD_INSIGHTS')\n",
"print(\"-- initalize model --\")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- initializes the model, it's now ready to train -- \n",
"S3_FILE = \"dataset-training.csv\"\n",
"S3_FILE_LOC = \"s3://{0}/{1}\".format(S3_BUCKET,S3_FILE)\n",
"\n",
"response = afd_resource.create_model_version(\n",
" modelId = MODEL_NAME,\n",
" modelType = 'ONLINE_FRAUD_INSIGHTS',\n",
" trainingDataSource = 'EXTERNAL_EVENTS',\n",
" trainingDataSchema = trainingDataSchema,\n",
" externalEventsDetail = {\n",
" 'dataLocation' : S3_FILE_LOC,\n",
" 'dataAccessRoleArn': ARN_ROLE\n",
" }\n",
")\n",
"print(\"-- model training --\")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- poll the model version once a minute until training is no longer in progress -- \n",
"print(\"-- wait for model training to complete --\")\n",
"stime = time.time()\n",
"while True:\n",
"    clear_output(wait=True)\n",
"    response = afd_resource.get_model_version(modelId=MODEL_NAME, modelType = \"ONLINE_FRAUD_INSIGHTS\", modelVersionNumber = '1.0')\n",
"    if response['status'] != 'TRAINING_IN_PROGRESS':\n",
"        print(\"Model status : \" + response['status'])\n",
"        break\n",
"    # still training: report the elapsed minutes, then wait before asking again\n",
"    print(f\"current progress: {(time.time() - stime)/60:3.3} minutes\")\n",
"    time.sleep(60)\n",
"\n",
"etime = time.time()\n",
"\n",
"# -- summarize -- \n",
"print(\"\\n --- model training complete --\")\n",
"print(\"Elapsed time : %s seconds \\n\" % (etime - stime))\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5. Activate the Model and evaluate the performance\n",
"-----\n",
" \n",
" 💡 Activate Model. \n",
"\n",
"The following section will activate your trained model and plot its ROC curve so you can evaluate its performance. \n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- request activation of the trained model version -- \n",
"response = afd_resource.update_model_version_status(\n",
"    modelId = MODEL_NAME,\n",
"    modelType = 'ONLINE_FRAUD_INSIGHTS',\n",
"    modelVersionNumber = '1.0',\n",
"    status = 'ACTIVE'\n",
")\n",
"print(\"-- activating model --\")\n",
"print(response)\n",
"\n",
"# -- poll once a minute until the model version reports ACTIVE -- \n",
"print(\"--- waiting until model status is active \")\n",
"stime = time.time()\n",
"while True:\n",
"    clear_output(wait=True)\n",
"    response = afd_resource.get_model_version(modelId=MODEL_NAME, modelType = \"ONLINE_FRAUD_INSIGHTS\", modelVersionNumber = '1.0')\n",
"    if response['status'] == 'ACTIVE':\n",
"        print(\"Model status : \" + response['status'])\n",
"        break\n",
"    # not active yet: report the elapsed minutes, then wait before asking again\n",
"    print(f\"current progress: {(time.time() - stime)/60:3.3} minutes\")\n",
"    time.sleep(60)\n",
"\n",
"etime = time.time()\n",
"print(\"Elapsed time : %s seconds \\n\" % (etime - stime))\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# -- model performance summary -- \n",
"# A single describe call supplies both the AUC and the ROC data points.\n",
"training_metrics = afd_resource.describe_model_versions(\n",
"    modelId= MODEL_NAME,\n",
"    modelVersionNumber='1.0',\n",
"    modelType='ONLINE_FRAUD_INSIGHTS',\n",
"    maxResults=10\n",
")['modelVersionDetails'][0]['trainingResult']['trainingMetrics']\n",
"\n",
"# NOTE(review): `auc` shadows the `auc` function imported from sklearn.metrics in the\n",
"# imports cell; the name is kept because later cells may read it - confirm and rename.\n",
"auc = training_metrics['auc']\n",
"df_model = pd.DataFrame(training_metrics['metricDataPoints'])\n",
"\n",
"\n",
"plt.figure(figsize=(10,10))\n",
"plt.plot(df_model[\"fpr\"], df_model[\"tpr\"], color='darkorange',\n",
"         lw=2, label='ROC curve (area = %0.3f)' % auc)\n",
"# diagonal reference line from (0, 0) to (1, 1)\n",
"plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title( MODEL_NAME + ' ROC Chart')\n",
"plt.legend(loc=\"lower right\",fontsize=12)\n",
"# red guide lines at FPR = 0.02 and TPR = 0.73 (hard-coded values)\n",
"plt.axvline(x = 0.02 ,linewidth=2, color='r')\n",
"plt.axhline(y = 0.73 ,linewidth=2, color='r')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6. Create Detector, generate Rules and assemble your Detector\n",
"\n",
"-----\n",
" \n",
" 💡 Generate Rules, Create and Publish a Detector. \n",
" \n",
"The following section will automatically generate a number of fraud, investigate and approve rules based on the false positive rate and score thresholds of your model. These are just example rules that you could create, it is recommended that you fine tune your rules specifically to your business use case.\n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- initialize your detector -- \n",
"response = afd_resource.put_detector(detectorId = DETECTOR_NAME, \n",
" description = DETECTOR_DESC, \n",
" eventTypeName = EVENT_TYPE )\n",
"\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- make rules -- \n",
"model_stat = df_model.round(decimals=2)\n",
"\n",
"# for every rounded false positive rate keep the row holding the highest threshold\n",
"m = model_stat.loc[model_stat.groupby([\"fpr\"])[\"threshold\"].idxmax()]\n",
"\n",
"def make_rule(x):\n",
"    \"\"\"Return the rule expression for one row of `m`, or an empty string when none applies.\"\"\"\n",
"    score_variable = \"${0}_insightscore\".format(MODEL_NAME)\n",
"    if x['fpr'] == 0.06:\n",
"        # the 6% row uses the threshold of the row before it\n",
"        return \"{0} <= {1}\".format(score_variable, x['threshold_prev'])\n",
"    if x['fpr'] <= 0.05:\n",
"        return \"{0} > {1}\".format(score_variable, x['threshold'])\n",
"    return \"\"\n",
"\n",
"m[\"threshold_prev\"] = m['threshold'].shift(1)\n",
"m['rule'] = m.apply(make_rule, axis=1)\n",
"\n",
"# outcome by false positive rate: up to 3% fraud, above 3% up to 5% investigate, otherwise approve\n",
"m['outcome'] = \"approve\"\n",
"m.loc[m['fpr'] <= 0.03, \"outcome\"] = \"fraud\"\n",
"m.loc[(m['fpr'] > 0.03) & (m['fpr'] <= 0.05), \"outcome\"] = \"investigate\"\n",
"\n",
"print (\" --- score thresholds 1% to 6% --- \")\n",
"print(m.loc[(m['fpr'] > 0.0 ) & (m['fpr'] <= 0.06), [\"fpr\", \"tpr\", \"threshold\", \"rule\", \"outcome\"]].reset_index(drop=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- create outcomes -- \n",
"def create_outcomes(outcomes):\n",
"    \"\"\" Register each name in `outcomes` as a Fraud Detector outcome.\n",
"\n",
"    The outcome name is also used as its description.\n",
"    \"\"\"\n",
"    for outcome_name in outcomes:\n",
"        print(\"creating outcome variable: {0} \".format(outcome_name))\n",
"        afd_resource.put_outcome(name=outcome_name, description=outcome_name)\n",
"\n",
"# -- get distinct outcomes \n",
"outcomes = m[\"outcome\"].unique().tolist()\n",
"\n",
"create_outcomes(outcomes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- one detector rule per score threshold row between 1% and 6% false positive rate -- \n",
"rule_set = m[(m[\"fpr\"] > 0.0) & (m[\"fpr\"] <= 0.06)][[\"outcome\", \"rule\"]].to_dict('records')\n",
"rule_list = []\n",
"for i, rule in enumerate(rule_set):\n",
"    ruleId = \"rule{0}_{1}\".format(i, MODEL_NAME)\n",
"    # every rule is referenced by the detector version, whether or not it is created below\n",
"    rule_list.append({\"ruleId\": ruleId,\n",
"                      \"ruleVersion\" : '1',\n",
"                      \"detectorId\" : DETECTOR_NAME})\n",
"    print(\"creating rule: {0}: IF {1} THEN {2}\".format(ruleId, rule[\"rule\"], rule['outcome']))\n",
"    try:\n",
"        response = afd_resource.create_rule(\n",
"            ruleId = ruleId,\n",
"            detectorId = DETECTOR_NAME,\n",
"            expression = rule['rule'],\n",
"            language = 'DETECTORPL',\n",
"            outcomes = [rule['outcome']]\n",
"        )\n",
"    except afd_resource.exceptions.ClientError as err:\n",
"        # Keep going so the cell can be re-run, but show what the service actually\n",
"        # reported instead of assuming the rule already exists.\n",
"        print(\"rule {0} was not created: {1}\".format(ruleId, err))\n",
"rule_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -- assemble the detector version from the rules and the trained model version -- \n",
"# The result is stored in `response` so the print below shows this call's output\n",
"# rather than a response left over from an earlier cell.\n",
"response = afd_resource.create_detector_version(\n",
"    detectorId = DETECTOR_NAME,\n",
"    rules = rule_list,\n",
"    modelVersions = [{\"modelId\":MODEL_NAME,\n",
"                      \"modelType\" : \"ONLINE_FRAUD_INSIGHTS\",\n",
"                      \"modelVersionNumber\" : \"1.0\"}],\n",
"    ruleExecutionMode = 'FIRST_MATCHED'\n",
")\n",
"\n",
"print(\"\\n -- detector created -- \")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = afd_resource.update_detector_version_status(\n",
" detectorId= DETECTOR_NAME,\n",
" detectorVersionId='1',\n",
" status='ACTIVE'\n",
")\n",
"print(\"\\n -- detector activated -- \")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing our model endpoint"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"S3_FILE = \"dataset-test.csv\"\n",
"S3_FILE_LOC = \"s3://{0}/{1}\".format(S3_BUCKET,S3_FILE)\n",
"\n",
"s3_resource.Bucket(S3_BUCKET).download_file(S3_FILE, 'dataset-test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" EVENT_TIMESTAMP \n",
" tipologia \n",
" canal \n",
" segmento \n",
" rut \n",
" dv \n",
" ctaori \n",
" ctades \n",
" nomdes \n",
" nombancdes \n",
" rutdes \n",
" monto \n",
" ipcli \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2020-07-08 11:26:33 \n",
" Pagos y Transferencias \n",
" APP \n",
" PF \n",
" 8b7b9dc18f37bf63f95dd21bc153e392 \n",
" 45c48cce2e2d7fbdea1afc51c7c6ad26 \n",
" 06bb5bef0faf68e34425cfd721ac983e \n",
" b59e491ccfc02f404b1be01976b471e0 \n",
" christian basoalto \n",
" Banco de Chile / Edwards-Citi \n",
" fd357482b5f76349fc81af3ba52e29b3 \n",
" 199999.0 \n",
" 192.168.88.137 \n",
" \n",
" \n",
" 1 \n",
" 2020-07-19 22:06:00 \n",
" Pagos y Transferencias \n",
" APP \n",
" PF \n",
" c455c3cf1fc59f7e6aa7079b43625395 \n",
" a87ff679a2f3e71d9181a67b7542122c \n",
" 9a81863b7b1156dd84576b404efd847b \n",
" ee60570fc4f7efbfb4acd953884969e0 \n",
" maria eugenia gonzalez videla \n",
" Banco Santander-Santiago \n",
" 5845e9d5ed68b3333f8aaffa0f8a110e \n",
" 130000.0 \n",
" 192.168.88.137 \n",
" \n",
" \n",
" 2 \n",
" 2020-07-31 11:59:45 \n",
" Pagos y Transferencias \n",
" APP \n",
" PF \n",
" 9d4d67cab011a7a9e3171d8096d8a5ac \n",
" a5f3c6a11b03839d46af9fb43c97c188 \n",
" 2a2a2feb2ee7b91b962937d3df4fa4fd \n",
" f54d524ade8984180c9ccb50e74e9768 \n",
" Cassandra Ornano \n",
" Banco Santander-Santiago \n",
" e242a0a3af653999d878598dd7025810 \n",
" 15890.0 \n",
" 192.168.88.137 \n",
" \n",
" \n",
" 3 \n",
" 2020-07-02 23:42:51 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PF \n",
" 8f0c2c558aed3b7b27961f9102d19bfe \n",
" e4da3b7fbbce2345d7772b0674a318d5 \n",
" ce4b1bbc50cad868d8c9152582bc8652 \n",
" e3eb10ec1a4ec2a86826f00f8df019ee \n",
" Claudio Jara Molina \n",
" Banco Santander-Santiago \n",
" c72af796f3ce0fae237e1c6b216ceb28 \n",
" 20000.0 \n",
" 186.11.62.185 \n",
" \n",
" \n",
" 4 \n",
" 2020-07-02 11:22:15 \n",
" Pagos y Transferencias \n",
" APP \n",
" PF \n",
" 7c56e083a2408573386883ae08da9a72 \n",
" c9f0f895fb98ab9159f51fd0297e236d \n",
" 1d2089c0b7a1449669b29fcbe6ffa27b \n",
" f0ea88913593ba576a22d67e52dc5b99 \n",
" hugo muñoz \n",
" Banco Falabella \n",
" ce846b2b5b17b28180d9e4d160cfd453 \n",
" 120000.0 \n",
" 192.168.88.137 \n",
" \n",
" \n",
" 5 \n",
" 2020-07-26 20:07:28 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PF \n",
" 496b9cb13a8ccec4dab279a53555adf6 \n",
" eccbc87e4b5ce2fe28308fd9f2a7baf3 \n",
" 95b20c920b33cc0ed271ab7eba1c1cd4 \n",
" b261ea0ad9a0d4065d079906494ce454 \n",
" matilde maddaleno herrera \n",
" Banco de Chile \n",
" d216482286e31df42bc1f025e90fd5de \n",
" 12000.0 \n",
" 201.239.242.46 \n",
" \n",
" \n",
" 6 \n",
" 2020-07-21 08:46:07 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PJ \n",
" d41e62a45537cc9e47da4aa4856beed6 \n",
" a87ff679a2f3e71d9181a67b7542122c \n",
" a3d2de7675556553a5f08e4c88d2c228 \n",
" a3d2de7675556553a5f08e4c88d2c228 \n",
" newiol/bic.iol.Transferencias.tef.tefa3ros \n",
" IENQ005FP_ConfTrEjecutada \n",
" fb21e94fb209a71e831f6019fad298bc \n",
" 0.0 \n",
" 190.45.200.29 \n",
" \n",
" \n",
" 7 \n",
" 2020-07-17 21:32:16 \n",
" Pagos y Transferencias \n",
" APP \n",
" PF \n",
" fa54fef00d81db63d36ffa08d26c43ff \n",
" 45c48cce2e2d7fbdea1afc51c7c6ad26 \n",
" b4ec8b2dd178d847c6df71da9264d352 \n",
" b12a6281f35a48703dd04ceeac690dac \n",
" Laura Vera \n",
" Banco Estado \n",
" bbddfd8cc3eafb60001c5c06cdc6183b \n",
" 2300.0 \n",
" 192.168.88.137 \n",
" \n",
" \n",
" 8 \n",
" 2020-07-13 21:12:01 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PF \n",
" 7d5e960687d13c11ccce83bd0518d717 \n",
" c81e728d9d4c2f636f067f89cc14862c \n",
" 877b46e92286b0aa728e4ae16c65134d \n",
" e9b7a87bb5689e44b24a68e033766aea \n",
" texia navarro pasten \n",
" Banco del Estado de Chile \n",
" bcd414eac4268b89b98639b012b6af76 \n",
" 90000.0 \n",
" 201.215.86.174 \n",
" \n",
" \n",
" 9 \n",
" 2020-07-17 14:35:05 \n",
" Pagos y Transferencias \n",
" Webmobile \n",
" PF \n",
" 3176e3cf35af1994d3b0ba7a13237d74 \n",
" c4ca4238a0b923820dcc509a6f75849b \n",
" 031af4f2f5ea6a3d51e69fd55ddc1903 \n",
" e634726547937b614ebefa9986a6537e \n",
" Cristian aguero \n",
" Banco del Estado de Chile \n",
" c4c1b24523ef03e3aecc2c6da0a83b55 \n",
" 9000.0 \n",
" 152.231.126.216 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_TIMESTAMP tipologia canal segmento rut dv ctaori ctades nomdes nombancdes rutdes monto ipcli\n",
"0 2020-07-08 11:26:33 Pagos y Transferencias APP PF 8b7b9dc18f37bf63f95dd21bc153e392 45c48cce2e2d7fbdea1afc51c7c6ad26 06bb5bef0faf68e34425cfd721ac983e b59e491ccfc02f404b1be01976b471e0 christian basoalto Banco de Chile / Edwards-Citi fd357482b5f76349fc81af3ba52e29b3 199999.0 192.168.88.137 \n",
"1 2020-07-19 22:06:00 Pagos y Transferencias APP PF c455c3cf1fc59f7e6aa7079b43625395 a87ff679a2f3e71d9181a67b7542122c 9a81863b7b1156dd84576b404efd847b ee60570fc4f7efbfb4acd953884969e0 maria eugenia gonzalez videla Banco Santander-Santiago 5845e9d5ed68b3333f8aaffa0f8a110e 130000.0 192.168.88.137 \n",
"2 2020-07-31 11:59:45 Pagos y Transferencias APP PF 9d4d67cab011a7a9e3171d8096d8a5ac a5f3c6a11b03839d46af9fb43c97c188 2a2a2feb2ee7b91b962937d3df4fa4fd f54d524ade8984180c9ccb50e74e9768 Cassandra Ornano Banco Santander-Santiago e242a0a3af653999d878598dd7025810 15890.0 192.168.88.137 \n",
"3 2020-07-02 23:42:51 Pagos y Transferencias WEB PF 8f0c2c558aed3b7b27961f9102d19bfe e4da3b7fbbce2345d7772b0674a318d5 ce4b1bbc50cad868d8c9152582bc8652 e3eb10ec1a4ec2a86826f00f8df019ee Claudio Jara Molina Banco Santander-Santiago c72af796f3ce0fae237e1c6b216ceb28 20000.0 186.11.62.185 \n",
"4 2020-07-02 11:22:15 Pagos y Transferencias APP PF 7c56e083a2408573386883ae08da9a72 c9f0f895fb98ab9159f51fd0297e236d 1d2089c0b7a1449669b29fcbe6ffa27b f0ea88913593ba576a22d67e52dc5b99 hugo muñoz Banco Falabella ce846b2b5b17b28180d9e4d160cfd453 120000.0 192.168.88.137 \n",
"5 2020-07-26 20:07:28 Pagos y Transferencias WEB PF 496b9cb13a8ccec4dab279a53555adf6 eccbc87e4b5ce2fe28308fd9f2a7baf3 95b20c920b33cc0ed271ab7eba1c1cd4 b261ea0ad9a0d4065d079906494ce454 matilde maddaleno herrera Banco de Chile d216482286e31df42bc1f025e90fd5de 12000.0 201.239.242.46 \n",
"6 2020-07-21 08:46:07 Pagos y Transferencias WEB PJ d41e62a45537cc9e47da4aa4856beed6 a87ff679a2f3e71d9181a67b7542122c a3d2de7675556553a5f08e4c88d2c228 a3d2de7675556553a5f08e4c88d2c228 newiol/bic.iol.Transferencias.tef.tefa3ros IENQ005FP_ConfTrEjecutada fb21e94fb209a71e831f6019fad298bc 0.0 190.45.200.29 \n",
"7 2020-07-17 21:32:16 Pagos y Transferencias APP PF fa54fef00d81db63d36ffa08d26c43ff 45c48cce2e2d7fbdea1afc51c7c6ad26 b4ec8b2dd178d847c6df71da9264d352 b12a6281f35a48703dd04ceeac690dac Laura Vera Banco Estado bbddfd8cc3eafb60001c5c06cdc6183b 2300.0 192.168.88.137 \n",
"8 2020-07-13 21:12:01 Pagos y Transferencias WEB PF 7d5e960687d13c11ccce83bd0518d717 c81e728d9d4c2f636f067f89cc14862c 877b46e92286b0aa728e4ae16c65134d e9b7a87bb5689e44b24a68e033766aea texia navarro pasten Banco del Estado de Chile bcd414eac4268b89b98639b012b6af76 90000.0 201.215.86.174 \n",
"9 2020-07-17 14:35:05 Pagos y Transferencias Webmobile PF 3176e3cf35af1994d3b0ba7a13237d74 c4ca4238a0b923820dcc509a6f75849b 031af4f2f5ea6a3d51e69fd55ddc1903 e634726547937b614ebefa9986a6537e Cristian aguero Banco del Estado de Chile c4c1b24523ef03e3aecc2c6da0a83b55 9000.0 152.231.126.216"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Widen the pandas display limits so the wide test frame is not truncated.\n",
"for option_name, option_value in (\n",
"    ('display.max_rows', 500),\n",
"    ('display.max_columns', 500),\n",
"    ('display.width', 1000),\n",
"):\n",
"    pd.set_option(option_name, option_value)\n",
"\n",
"# Load the downloaded test file and preview its first ten rows.\n",
"test = pd.read_csv('dataset-test.csv', sep=',')\n",
"test.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Select the model variables by excluding the `EVENT_LABEL` and `EVENT_TIMESTAMP` columns from the test dataset, and define the event timestamp sent with each prediction request."
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tipologia canal segmento rut dv ctaori ctades nomdes nombancdes rutdes monto ipcli\n"
]
}
],
"source": [
"# Number of test rows sent to the detector by the batch-scoring cell.\n",
"record_count = 400\n",
"\n",
"# Event variables passed to the detector: every column except the label and the event timestamp.\n",
"non_feature_columns = ['EVENT_LABEL', 'EVENT_TIMESTAMP']\n",
"model_variables = [column for column in test.columns if column not in non_feature_columns]\n",
"\n",
"# The trailing 'Z' in the format marks the value as UTC, so it is built from the UTC clock;\n",
"# datetime.now() would stamp local wall-clock time with a UTC designator.\n",
"dateTimeObj = datetime.utcnow()\n",
"timestampStr = dateTimeObj.strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
"print(' '.join(model_variables))"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'modelScores': [{'modelVersion': {'modelId': 'fraud_det_model20200908', 'modelType': 'ONLINE_FRAUD_INSIGHTS', 'modelVersionNumber': '2.0'}, 'scores': {'fraud_det_model20200908_insightscore': 432.0}}], 'ruleResults': [{'ruleId': 'rule1_fraud_det_model20200908', 'outcomes': ['investigate']}], 'ResponseMetadata': {'RequestId': '4e7d4056-fc33-4d26-b6a9-1d9b561bcc89', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1', 'date': 'Mon, 14 Sep 2020 20:41:17 GMT', 'x-amzn-requestid': '4e7d4056-fc33-4d26-b6a9-1d9b561bcc89', 'content-length': '277', 'connection': 'keep-alive'}, 'RetryAttempts': 0}}\n"
]
}
],
"source": [
"import uuid\n",
"\n",
"# Smoke-test the endpoint with one prediction: the record at position 6 (0-based)\n",
"# of the test set, with every value converted to a string.\n",
"eventId = uuid.uuid1()\n",
"first_rows = test[model_variables].head(15).astype(str)\n",
"testrecord = first_rows.to_dict(orient='records')[6]\n",
"single_request = dict(\n",
"    detectorId=DETECTOR_NAME,\n",
"    detectorVersionId='1',\n",
"    eventId=str(eventId),\n",
"    eventTypeName=EVENT_TYPE,\n",
"    eventTimestamp=timestampStr,\n",
"    entities=[{'entityType': ENTITY_TYPE, 'entityId': str(eventId.int)}],\n",
"    eventVariables=testrecord,\n",
")\n",
"pred = afd_resource.get_event_prediction(**single_request)\n",
"print(pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next block uses some parallelization to run several tests against the fraud detector endpoint."
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"current progress: 100.0 %\n",
"Process took 4.35 seconds\n",
"Scored 404 records\n"
]
}
],
"source": [
"import dask\n",
"import time\n",
"from IPython.display import display, HTML\n",
"\n",
"start = time.time()\n",
"\n",
"@dask.delayed\n",
"def _predict(record):\n",
"    \"\"\"Score one event with the fraud detector and annotate its record.\n",
"\n",
"    Args:\n",
"        record: dict of event variables for a single event, passed as\n",
"            `eventVariables` to get_event_prediction.\n",
"\n",
"    Returns:\n",
"        The record dict with two keys added: \"score\" (the model insight score,\n",
"        or \"-999\" when the call fails) and \"outcomes\" (the outcomes of the first\n",
"        rule result, 'approve' when no rule result is returned, or \"error\" when\n",
"        the call fails).\n",
"    \"\"\"\n",
"    eventId = uuid.uuid1()\n",
"    try:\n",
"        pred = afd_resource.get_event_prediction(\n",
"            detectorId=DETECTOR_NAME,\n",
"            detectorVersionId='1',\n",
"            eventId=str(eventId),\n",
"            eventTypeName=EVENT_TYPE,\n",
"            eventTimestamp=timestampStr,\n",
"            entities=[{'entityType': ENTITY_TYPE, 'entityId': str(eventId.int)}],\n",
"            eventVariables=record,\n",
"        )\n",
"        record[\"score\"] = pred['modelScores'][0]['scores'][\"{0}_insightscore\".format(MODEL_NAME)]\n",
"        rule_results = pred['ruleResults']\n",
"        record[\"outcomes\"] = rule_results[0]['outcomes'] if rule_results else 'approve'\n",
"    except Exception as err:\n",
"        # Mark the record as failed instead of re-sending the same request: the retry's\n",
"        # result was never used, and a second failure would abort the whole batch.\n",
"        print(\"prediction failed: {0}\".format(err))\n",
"        record[\"score\"] = \"-999\"\n",
"        record[\"outcomes\"] = \"error\"\n",
"    return record\n",
"\n",
"predict_data = test[model_variables].head(record_count).astype(str).to_dict(orient='records')\n",
"# The test set may hold fewer rows than record_count, so progress is measured\n",
"# against the number of records actually queued.\n",
"total_records = len(predict_data)\n",
"\n",
"# _predict is already lazy (decorated above): calling it only queues the task,\n",
"# the requests themselves are sent by dask.compute below.\n",
"predict_score = []\n",
"for queued, record in enumerate(predict_data, start=1):\n",
"    clear_output(wait=True)\n",
"    predict_score.append(_predict(record))\n",
"    print(\"current progress: \", round((queued / total_records) * 100, 2), \"%\")\n",
"\n",
"predict_recs = dask.compute(*predict_score)\n",
"\n",
"# Calculate time taken and print results\n",
"time_taken = time.time() - start\n",
"tps = len(predict_recs) / time_taken\n",
"\n",
"print('Process took %0.2f seconds' % time_taken)\n",
"print('Scored %d records' % len(predict_recs))\n",
"print('Throughput: %0.2f records/second' % tps)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" tipologia \n",
" canal \n",
" segmento \n",
" rut \n",
" dv \n",
" ctaori \n",
" ctades \n",
" nomdes \n",
" nombancdes \n",
" rutdes \n",
" monto \n",
" ipcli \n",
" score \n",
" outcomes \n",
" \n",
" \n",
" \n",
" \n",
" 401 \n",
" Sin tipologia \n",
" WEB \n",
" PF \n",
" 05ef2b2ec63b0451e753da506407ce59 \n",
" c4ca4238a0b923820dcc509a6f75849b \n",
" 8d8b98d0765127cadd3e850af72f8b4d \n",
" 35298865c6b58ae65570b30e2cfaf83b \n",
" victoria \n",
" Banco del Estado de Chile \n",
" b2d3f8b557944f6780d6d60f7b923eaf \n",
" 200000.0 \n",
" 186.11.58.2 \n",
" 1000 \n",
" [fraud] \n",
" \n",
" \n",
" 402 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PF \n",
" 4a379677c0f2edd6a98cebc2c46f4707 \n",
" 45c48cce2e2d7fbdea1afc51c7c6ad26 \n",
" 457ad069eebf1f77dd529fca55e1ec6a \n",
" 952cd2c29bb380044ecb086d43acdb05 \n",
" alan cerda \n",
" Banco del Estado de Chile \n",
" 781c1bcc58864eab9c42b5e2a8b40660 \n",
" 200000.0 \n",
" 186.10.157.179 \n",
" 999 \n",
" [fraud] \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tipologia canal segmento rut dv ctaori ctades nomdes nombancdes rutdes monto ipcli score outcomes\n",
"401 Sin tipologia WEB PF 05ef2b2ec63b0451e753da506407ce59 c4ca4238a0b923820dcc509a6f75849b 8d8b98d0765127cadd3e850af72f8b4d 35298865c6b58ae65570b30e2cfaf83b victoria Banco del Estado de Chile b2d3f8b557944f6780d6d60f7b923eaf 200000.0 186.11.58.2 1000 [fraud]\n",
"402 Pagos y Transferencias WEB PF 4a379677c0f2edd6a98cebc2c46f4707 45c48cce2e2d7fbdea1afc51c7c6ad26 457ad069eebf1f77dd529fca55e1ec6a 952cd2c29bb380044ecb086d43acdb05 alan cerda Banco del Estado de Chile 781c1bcc58864eab9c42b5e2a8b40660 200000.0 186.10.157.179 999 [fraud]"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a frame from the scored records and show the ones scored above 900 (the predicted frauds).\n",
"predictions = pd.DataFrame.from_dict(predict_recs, orient='columns')\n",
"high_score_mask = predictions['score'].astype('float32') > 900\n",
"predictions.loc[high_score_mask]"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Fecha \n",
" TipologĂa \n",
" Canal \n",
" Segmento \n",
" Rut \n",
" Dv \n",
" CtaOri \n",
" CtaDes \n",
" NomDes \n",
" NomBancDes \n",
" RUTDES \n",
" MONTO \n",
" Fraude \n",
" IPCLI \n",
" \n",
" \n",
" \n",
" \n",
" 369910 \n",
" 2020-07-07 12:37:23 \n",
" NaN \n",
" WEB \n",
" PF \n",
" 05ef2b2ec63b0451e753da506407ce59 \n",
" c4ca4238a0b923820dcc509a6f75849b \n",
" 8d8b98d0765127cadd3e850af72f8b4d \n",
" 35298865c6b58ae65570b30e2cfaf83b \n",
" victoria \n",
" Banco del Estado de Chile \n",
" b2d3f8b557944f6780d6d60f7b923eaf \n",
" 200000.0 \n",
" SI \n",
" 186.11.58.2 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Fecha TipologĂa Canal Segmento Rut Dv CtaOri CtaDes NomDes NomBancDes RUTDES MONTO Fraude IPCLI\n",
"369910 2020-07-07 12:37:23 NaN WEB PF 05ef2b2ec63b0451e753da506407ce59 c4ca4238a0b923820dcc509a6f75849b 8d8b98d0765127cadd3e850af72f8b4d 35298865c6b58ae65570b30e2cfaf83b victoria Banco del Estado de Chile b2d3f8b557944f6780d6d60f7b923eaf 200000.0 SI 186.11.58.2 "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of `data` whose NomDes and RUTDES both match the values below.\n",
"name_matches = data[\"NomDes\"] == \"victoria\"\n",
"rutdes_matches = data[\"RUTDES\"] == 'b2d3f8b557944f6780d6d60f7b923eaf'\n",
"data[name_matches & rutdes_matches]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Fecha \n",
" TipologĂa \n",
" Canal \n",
" Segmento \n",
" Rut \n",
" Dv \n",
" CtaOri \n",
" CtaDes \n",
" NomDes \n",
" NomBancDes \n",
" RUTDES \n",
" MONTO \n",
" Fraude \n",
" IPCLI \n",
" \n",
" \n",
" \n",
" \n",
" 895701 \n",
" 2020-07-16 13:39:57 \n",
" Pagos y Transferencias \n",
" WEB \n",
" PF \n",
" 4a379677c0f2edd6a98cebc2c46f4707 \n",
" 45c48cce2e2d7fbdea1afc51c7c6ad26 \n",
" 457ad069eebf1f77dd529fca55e1ec6a \n",
" 952cd2c29bb380044ecb086d43acdb05 \n",
" alan cerda \n",
" Banco del Estado de Chile \n",
" 781c1bcc58864eab9c42b5e2a8b40660 \n",
" 200000.0 \n",
" SI \n",
" 186.10.157.179 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Fecha TipologĂa Canal Segmento Rut Dv CtaOri CtaDes NomDes NomBancDes RUTDES MONTO Fraude IPCLI\n",
"895701 2020-07-16 13:39:57 Pagos y Transferencias WEB PF 4a379677c0f2edd6a98cebc2c46f4707 45c48cce2e2d7fbdea1afc51c7c6ad26 457ad069eebf1f77dd529fca55e1ec6a 952cd2c29bb380044ecb086d43acdb05 alan cerda Banco del Estado de Chile 781c1bcc58864eab9c42b5e2a8b40660 200000.0 SI 186.10.157.179 "
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of `data` whose NomDes and RUTDES both match the values below.\n",
"lookup_mask = (data[\"NomDes\"] == \"alan cerda\") & (data[\"RUTDES\"] == '781c1bcc58864eab9c42b5e2a8b40660')\n",
"data.loc[lookup_mask]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See the model metrics on CloudWatch and the prediction history in Fraud Detector."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Serialize the predictions to CSV in memory and upload the text to S3_BUCKET.\n",
"# NOTE(review): the object key spells \"precictions\"; it is kept unchanged in case\n",
"# something reads that key - confirm before renaming.\n",
"csv_buffer = StringIO()\n",
"predictions.to_csv(csv_buffer, index=False)\n",
"predictions_key = MODEL_NAME + \"precictions{}.csv\".format(sufx)\n",
"s3_resource.Object(S3_BUCKET, predictions_key).put(Body=csv_buffer.getvalue())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#data.loc[data['vaa'] == 0.14205158164005, 'vaa':'EVENT_LABEL']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finish"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}