{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Fraud detection combining both Amazon Fraud Detector and Amazon SageMaker models in the same transaction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Importing tools and libraries to be used" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "import os\n", "import io\n", "import sys\n", "import json\n", "import uuid\n", "import numpy as np \n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import boto3\n", "import time\n", "import sklearn\n", "import seaborn as sns\n", "import sagemaker\n", "\n", "from IPython.display import clear_output\n", "from datetime import datetime\n", "from io import StringIO\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from imblearn.over_sampling import SMOTENC\n", "from collections import Counter\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import dump_svmlight_file\n", "from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, confusion_matrix\n", "from sklearn.metrics import roc_curve, roc_auc_score, auc, roc_auc_score, classification_report\n", "%matplotlib inline\n", "from sagemaker import get_execution_role\n", "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "from sagemaker.predictor import csv_serializer\n", "from math import sqrt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Investigate and process the data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start by downloading the dataset from: https://www.kaggle.com/mlg-ulb/creditcardfraud?select=creditcard.csv and upload it into the notebook file system as creditcard.csv." 
] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# Resources and env variables setup\n", "s3_resource = boto3.resource('s3')\n", "afd_resource = boto3.client('frauddetector')\n", "\n", "# suffix is appended to detector and model name for uniqueness \n", "sufx = datetime.now().strftime(\"%Y%m%d\")\n", "# prefix is prepended to the sagamaker components\n", "prefx = 'sagemaker-model'\n", "# replace with the training bucket created in the CloudFormation\n", "S3_BUCKET = \"afd-poc-trainingbucket-1i37svk9elcoe\"\n", "# replace with the output bucket created in the CloudFormation\n", "S3_OUT_BUCKET = \"afd-poc-outputbucket-1fhn20d7tumgg\"\n", "# Replace the ARN Role with the resources created in CloudFormation stack\n", "ARN_ROLE = \"arn:aws:iam::387461613214:role/afd-poc-NotebookInstanceExecutionRole-1FNQ41S8H2G68\" " ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 1000)\n", "\n", "data = pd.read_csv('creditcard.csv', delimiter=',')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's take a peek at our data (we only show a subset of the columns in the table):" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
count284807.0000002.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05284807.000000284807.000000
mean94813.8595753.919560e-155.688174e-16-8.769071e-152.782312e-15-1.552563e-152.010663e-15-1.694249e-15-1.927028e-16-3.137024e-151.768627e-159.170318e-16-1.810658e-151.693438e-151.479045e-153.482336e-151.392007e-15-7.528491e-164.328772e-169.049732e-165.085503e-161.537294e-167.959909e-165.367590e-164.458112e-151.453003e-151.699104e-15-3.660161e-16-1.206049e-1688.3496190.001727
std47488.1459551.958696e+001.651309e+001.516255e+001.415869e+001.380247e+001.332271e+001.237094e+001.194353e+001.098632e+001.088850e+001.020713e+009.992014e-019.952742e-019.585956e-019.153160e-018.762529e-018.493371e-018.381762e-018.140405e-017.709250e-017.345240e-017.257016e-016.244603e-016.056471e-015.212781e-014.822270e-014.036325e-013.300833e-01250.1201090.041527
min0.000000-5.640751e+01-7.271573e+01-4.832559e+01-5.683171e+00-1.137433e+02-2.616051e+01-4.355724e+01-7.321672e+01-1.343407e+01-2.458826e+01-4.797473e+00-1.868371e+01-5.791881e+00-1.921433e+01-4.498945e+00-1.412985e+01-2.516280e+01-9.498746e+00-7.213527e+00-5.449772e+01-3.483038e+01-1.093314e+01-4.480774e+01-2.836627e+00-1.029540e+01-2.604551e+00-2.256568e+01-1.543008e+010.0000000.000000
25%54201.500000-9.203734e-01-5.985499e-01-8.903648e-01-8.486401e-01-6.915971e-01-7.682956e-01-5.540759e-01-2.086297e-01-6.430976e-01-5.354257e-01-7.624942e-01-4.055715e-01-6.485393e-01-4.255740e-01-5.828843e-01-4.680368e-01-4.837483e-01-4.988498e-01-4.562989e-01-2.117214e-01-2.283949e-01-5.423504e-01-1.618463e-01-3.545861e-01-3.171451e-01-3.269839e-01-7.083953e-02-5.295979e-025.6000000.000000
50%84692.0000001.810880e-026.548556e-021.798463e-01-1.984653e-02-5.433583e-02-2.741871e-014.010308e-022.235804e-02-5.142873e-02-9.291738e-02-3.275735e-021.400326e-01-1.356806e-025.060132e-024.807155e-026.641332e-02-6.567575e-02-3.636312e-033.734823e-03-6.248109e-02-2.945017e-026.781943e-03-1.119293e-024.097606e-021.659350e-02-5.213911e-021.342146e-031.124383e-0222.0000000.000000
75%139320.5000001.315642e+008.037239e-011.027196e+007.433413e-016.119264e-013.985649e-015.704361e-013.273459e-015.971390e-014.539234e-017.395934e-016.182380e-016.625050e-014.931498e-016.488208e-015.232963e-013.996750e-015.008067e-014.589494e-011.330408e-011.863772e-015.285536e-011.476421e-014.395266e-013.507156e-012.409522e-019.104512e-027.827995e-0277.1650000.000000
max172792.0000002.454930e+002.205773e+019.382558e+001.687534e+013.480167e+017.330163e+011.205895e+022.000721e+011.559499e+012.374514e+011.201891e+017.848392e+007.126883e+001.052677e+018.877742e+001.731511e+019.253526e+005.041069e+005.591971e+003.942090e+012.720284e+011.050309e+012.252841e+014.584549e+007.519589e+003.517346e+003.161220e+013.384781e+0125691.1600001.000000
\n", "
" ], "text/plain": [ " Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class\n", "count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000\n", "mean 94813.859575 3.919560e-15 5.688174e-16 -8.769071e-15 2.782312e-15 -1.552563e-15 2.010663e-15 -1.694249e-15 -1.927028e-16 -3.137024e-15 1.768627e-15 9.170318e-16 -1.810658e-15 1.693438e-15 1.479045e-15 3.482336e-15 1.392007e-15 -7.528491e-16 4.328772e-16 9.049732e-16 5.085503e-16 1.537294e-16 7.959909e-16 5.367590e-16 4.458112e-15 1.453003e-15 1.699104e-15 -3.660161e-16 -1.206049e-16 88.349619 0.001727\n", "std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527\n", "min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01 -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00 -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000\n", "25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 -5.354257e-01 -7.624942e-01 -4.055715e-01 
-6.485393e-01 -4.255740e-01 -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01 -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000\n", "50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03 -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000\n", "75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000\n", "max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(data.columns)\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of frauds: 492\n", "Number of non-frauds: 284315\n", "Percentage of fradulent data: 0.1727485630620034\n" ] } ], "source": [ "nonfrauds, frauds = data.groupby('Class').size()\n", "print('Number of frauds: ', frauds)\n", "print('Number of 
non-frauds: ', nonfrauds)\n", "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Class column indicates whether or not a transaction is fraudulent. The vast majority of the data is non-fraudulent: only $492$ transactions ($0.173\\%$ of the data, which matches the mean of the Class column shown above) are fraudulent." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The features have already been transformed with PCA; let's check the distribution (mean and standard deviation) of each one." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "data.hist(bins=50,figsize=(20,15))\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks good, columns 𝑉𝑖 have been normalized to have 0 mean and unit standard deviation as the result of a PCA. Now, lets change the data to be Amazon Fraud Detector compatible." ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount', 'class'], dtype='object')\n" ] } ], "source": [ "# to lowercase\n", "data.columns = map(str.lower, data.columns)\n", "print(data.columns)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'Amount', 'Class'], dtype='object')\n" ] } ], "source": [ "# mapping column names numbers to letters\n", "\n", "def standardize_headers(x):\n", "    \"\"\"Map a numbered header such as 'V1' to a letters-only column name.\n", "\n", "    A header made of one leading character followed by a positive integer n\n", "    becomes 'v' plus n spelled in bijective base 26 (1 -> 'va', 26 -> 'vz',\n", "    27 -> 'vaa', 28 -> 'vab', 53 -> 'vba', ...). Any other header, e.g.\n", "    'Time' or 'Amount', is returned unchanged.\n", "    \"\"\"\n", "    suffix = x[1:]\n", "    # isdecimal() guarantees int() succeeds and rejects an empty suffix\n", "    if not suffix.isdecimal():\n", "        return x\n", "    number = int(suffix)\n", "    if number == 0:\n", "        return x  # bijective numbering starts at 1\n", "    letters = ''\n", "    while number > 0:\n", "        number, remainder = divmod(number - 1, 26)\n", "        letters = chr(ord('a') + remainder) + letters\n", "    return 'v' + letters\n", "\n", "data.rename(columns=standardize_headers, inplace=True)\n", "print(data.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then change the timestamp and label column names" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountEVENT_LABEL
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n", "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0\n", "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0\n", "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0\n", "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0\n", "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename to the Amazon Fraud Detector name conventions \n", "data.rename(columns={'class':'EVENT_LABEL'}, inplace=True)\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get epoch time for the initial dataset date" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "1600808118.409983\n" ] } ], "source": [ "# Get epoch time for the initial dataset date\n", "# `epoch` is a naive datetime expressed in UTC, so every datetime compared\n", "# against it must be a naive UTC datetime as well.\n", "epoch = datetime.utcfromtimestamp(0)\n", "\n", "def unix_time_seconds(dt):\n", "    \"\"\"Return the seconds between the Unix epoch and `dt` (a naive UTC datetime).\"\"\"\n", "    return (dt - epoch).total_seconds()\n", "\n", "# Anchor the start of the dataset at the current UTC time (utcnow, not now:\n", "# a local-time value would shift the result by the machine's UTC offset).\n", "# To pin the dataset to a fixed date instead, replace the assignment with e.g.\n", "#   start_dt = datetime.strptime('Sep 22 2020 12:00AM', '%b %d %Y %I:%M%p')\n", "start_dt = datetime.utcnow()\n", "start_ep = unix_time_seconds(start_dt)\n", "print(start_ep)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The date column is represented as incremental seconds, lets translate it to dates formats." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountEVENT_LABELEVENT_TIMESTAMP
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.6202020-09-22T20:55:18Z
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.6902020-09-22T20:55:18Z
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.6602020-09-22T20:55:19Z
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.5002020-09-22T20:55:19Z
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.9902020-09-22T20:55:20Z
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL EVENT_TIMESTAMP\n", "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0 2020-09-22T20:55:18Z\n", "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0 2020-09-22T20:55:18Z\n", "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0 2020-09-22T20:55:19Z\n", "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0 2020-09-22T20:55:19Z\n", "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0 2020-09-22T20:55:20Z" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# translate seconds delta to actual datetimes in ISO 8601\n", "def to_datetime(x):\n", "    \"\"\"Convert a `time` offset in seconds into an ISO 8601 UTC timestamp string.\n", "\n", "    `x` is added to the notebook-level `start_ep`, which is expected to hold\n", "    seconds since the Unix epoch. The trailing 'Z' designates UTC, so the\n", "    instant is broken down with time.gmtime; time.localtime would label the\n", "    machine's local wall-clock time as UTC.\n", "    \"\"\"\n", "    current_ep = start_ep + x\n", "    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(current_ep))\n", 
"\n", "\n", "data['EVENT_TIMESTAMP'] = data['time'].apply(to_datetime)\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will split our dataset into a train and test to evaluate the performance of our models. It's important to do so before any techniques meant to alleviate the class imbalance are used. This ensures that we don't leak information from the test set into the train set." ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "features = data.drop('EVENT_LABEL', axis=1).values\n", "labels = (data['EVENT_LABEL'].values).astype('float32')" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "X, X_test, y, y_test = train_test_split(\n", " features, labels, test_size=0.1, random_state=42)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Counter({0.0: 255880, 1.0: 446})\n" ] } ], "source": [ "counter = Counter(y)\n", "print(counter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Getting the training and testing DataFrame back together" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountEVENT_LABEL
028515.01.2266430.101988-0.0870720.111524-0.281992-1.3560270.469050-0.371725-0.153672-0.145105-0.1435050.3209640.1493130.4525150.7762530.028739-0.177867-1.0047460.2649530.019692-0.355100-1.1536630.1097930.4203180.1979320.699218-0.1148610.00758350.400
183125.01.1248480.1256020.2499620.489744-0.0403860.167561-0.2476140.284736-0.067302-0.1701391.8951170.507733-1.0086480.1872491.0216630.1363200.370186-0.573559-0.625413-0.204465-0.192467-0.5768190.190343-0.3574510.0008700.139971-0.0009930.0115051.980
275537.0-0.3079021.0037151.4042770.5926270.311014-0.3821060.531393-0.015292-0.758638-0.5115970.1496430.2458660.752802-0.3822141.590185-0.3325460.611852-0.5104950.5637790.125220-0.131802-0.3292680.0469900.057413-0.6569600.1931920.1420380.1575011.980
3156358.02.174919-1.535441-0.726428-1.430792-1.517258-0.751038-1.155344-0.180811-1.1118851.536101-0.735705-0.7717140.238603-0.447050-0.085886-0.5683450.591162-0.104975-0.327292-0.334351-0.1127660.0500180.2946661.123322-0.306025-0.2413430.006553-0.02756764.000
4162523.0-2.2215561.2619872.0476424.659268-0.5359414.542044-3.715525-5.311701-0.9553210.200601-1.3426220.8799050.241171-0.365540-1.7354100.5644950.3806481.2166921.8727110.895990-1.8203880.873723-2.648598-0.162180-0.4921110.6014900.6270300.088289379.290
5113992.0-0.3351980.8713780.6327034.1642421.7025821.9543540.3967220.495056-2.5060201.6091371.029249-0.143569-0.1291870.8624281.059507-0.8621440.597711-0.5099670.2176720.2604040.5296351.4735580.033413-1.333266-0.7799610.5951960.2315470.19333257.780
618653.0-1.3059781.7721180.7417300.9123510.4988981.737490-0.957795-1.6923010.755233-0.6414610.900131-0.9353813.0682281.564764-1.357441-0.2161560.7128780.3547010.959930-0.5793861.985454-1.2901090.108807-1.4279370.140905-0.3934440.078297-0.0525051.000
7133445.0-0.3033561.144996-0.843639-1.0466090.945826-1.8227001.694510-0.465604-0.095435-0.150239-1.004530-0.509581-1.0395960.805069-0.077089-0.649795-0.363229-0.307477-0.1636330.0074690.2081480.842970-0.2014900.076193-0.2772750.0933390.4977110.33648017.000
887252.0-0.4659200.6283651.4495694.4173511.0346541.0839050.103565-0.624355-0.2261902.979203-1.319708-1.136836-0.027942-0.9722510.758803-0.104912-0.4627340.5219721.049701-0.0554640.1921561.231734-0.1131150.603061-1.8900370.069854-0.907822-0.13317011.310
940607.00.271095-2.7204890.427427-0.080256-2.0773110.334682-0.6204120.1741670.0874800.3472860.319218-0.528176-1.560458-0.281460-1.1035531.1813540.433565-0.6231771.1654831.0684530.341998-0.188797-0.5044120.0272950.140939-0.294037-0.0638470.102788552.890
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n", "0 28515.0 1.226643 0.101988 -0.087072 0.111524 -0.281992 -1.356027 0.469050 -0.371725 -0.153672 -0.145105 -0.143505 0.320964 0.149313 0.452515 0.776253 0.028739 -0.177867 -1.004746 0.264953 0.019692 -0.355100 -1.153663 0.109793 0.420318 0.197932 0.699218 -0.114861 0.007583 50.40 0\n", "1 83125.0 1.124848 0.125602 0.249962 0.489744 -0.040386 0.167561 -0.247614 0.284736 -0.067302 -0.170139 1.895117 0.507733 -1.008648 0.187249 1.021663 0.136320 0.370186 -0.573559 -0.625413 -0.204465 -0.192467 -0.576819 0.190343 -0.357451 0.000870 0.139971 -0.000993 0.011505 1.98 0\n", "2 75537.0 -0.307902 1.003715 1.404277 0.592627 0.311014 -0.382106 0.531393 -0.015292 -0.758638 -0.511597 0.149643 0.245866 0.752802 -0.382214 1.590185 -0.332546 0.611852 -0.510495 0.563779 0.125220 -0.131802 -0.329268 0.046990 0.057413 -0.656960 0.193192 0.142038 0.157501 1.98 0\n", "3 156358.0 2.174919 -1.535441 -0.726428 -1.430792 -1.517258 -0.751038 -1.155344 -0.180811 -1.111885 1.536101 -0.735705 -0.771714 0.238603 -0.447050 -0.085886 -0.568345 0.591162 -0.104975 -0.327292 -0.334351 -0.112766 0.050018 0.294666 1.123322 -0.306025 -0.241343 0.006553 -0.027567 64.00 0\n", "4 162523.0 -2.221556 1.261987 2.047642 4.659268 -0.535941 4.542044 -3.715525 -5.311701 -0.955321 0.200601 -1.342622 0.879905 0.241171 -0.365540 -1.735410 0.564495 0.380648 1.216692 1.872711 0.895990 -1.820388 0.873723 -2.648598 -0.162180 -0.492111 0.601490 0.627030 0.088289 379.29 0\n", "5 113992.0 -0.335198 0.871378 0.632703 4.164242 1.702582 1.954354 0.396722 0.495056 -2.506020 1.609137 1.029249 -0.143569 -0.129187 0.862428 1.059507 -0.862144 0.597711 -0.509967 0.217672 0.260404 0.529635 1.473558 0.033413 -1.333266 -0.779961 0.595196 0.231547 0.193332 57.78 0\n", "6 18653.0 -1.305978 1.772118 0.741730 0.912351 0.498898 1.737490 -0.957795 -1.692301 0.755233 -0.641461 0.900131 -0.935381 3.068228 
1.564764 -1.357441 -0.216156 0.712878 0.354701 0.959930 -0.579386 1.985454 -1.290109 0.108807 -1.427937 0.140905 -0.393444 0.078297 -0.052505 1.00 0\n", "7 133445.0 -0.303356 1.144996 -0.843639 -1.046609 0.945826 -1.822700 1.694510 -0.465604 -0.095435 -0.150239 -1.004530 -0.509581 -1.039596 0.805069 -0.077089 -0.649795 -0.363229 -0.307477 -0.163633 0.007469 0.208148 0.842970 -0.201490 0.076193 -0.277275 0.093339 0.497711 0.336480 17.00 0\n", "8 87252.0 -0.465920 0.628365 1.449569 4.417351 1.034654 1.083905 0.103565 -0.624355 -0.226190 2.979203 -1.319708 -1.136836 -0.027942 -0.972251 0.758803 -0.104912 -0.462734 0.521972 1.049701 -0.055464 0.192156 1.231734 -0.113115 0.603061 -1.890037 0.069854 -0.907822 -0.133170 11.31 0\n", "9 40607.0 0.271095 -2.720489 0.427427 -0.080256 -2.077311 0.334682 -0.620412 0.174167 0.087480 0.347286 0.319218 -0.528176 -1.560458 -0.281460 -1.103553 1.181354 0.433565 -0.623177 1.165483 1.068453 0.341998 -0.188797 -0.504412 0.027295 0.140939 -0.294037 -0.063847 0.102788 552.89 0" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "saved_cols = data.drop('EVENT_LABEL', axis=1).columns\n", "print(saved_cols)\n", "data = pd.DataFrame(X, columns = saved_cols)\n", "data['EVENT_LABEL']=y.astype(int)\n", "data.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get epoch time for the initial dataset date" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1600808761.483827\n" ] } ], "source": [ "# Get epoch time for the initial dataset date\n", "epoch = datetime.utcfromtimestamp(0)\n", "def unix_time_seconds(dt):\n", " return (dt - epoch).total_seconds()\n", "\n", "# Lets pretend that the data is from yesterday and could can test at the end with todays date.\n", "start_dt = datetime.strptime('Sep 22 2020 12:00AM', '%b %d %Y %I:%M%p')\n", "start_dt = datetime.now()\n", "start_ep = 
unix_time_seconds(start_dt)\n", "print(start_ep)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `time` column is represented as incremental seconds; let's translate it to ISO 8601 timestamps." ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountEVENT_LABELEVENT_TIMESTAMP
028515.01.2266430.101988-0.0870720.111524-0.281992-1.3560270.469050-0.371725-0.153672-0.145105-0.1435050.3209640.1493130.4525150.7762530.028739-0.177867-1.0047460.2649530.019692-0.355100-1.1536630.1097930.4203180.1979320.699218-0.1148610.00758350.4002020-09-23T05:01:16Z
183125.01.1248480.1256020.2499620.489744-0.0403860.167561-0.2476140.284736-0.067302-0.1701391.8951170.507733-1.0086480.1872491.0216630.1363200.370186-0.573559-0.625413-0.204465-0.192467-0.5768190.190343-0.3574510.0008700.139971-0.0009930.0115051.9802020-09-23T20:11:26Z
275537.0-0.3079021.0037151.4042770.5926270.311014-0.3821060.531393-0.015292-0.758638-0.5115970.1496430.2458660.752802-0.3822141.590185-0.3325460.611852-0.5104950.5637790.125220-0.131802-0.3292680.0469900.057413-0.6569600.1931920.1420380.1575011.9802020-09-23T18:04:58Z
3156358.02.174919-1.535441-0.726428-1.430792-1.517258-0.751038-1.155344-0.180811-1.1118851.536101-0.735705-0.7717140.238603-0.447050-0.085886-0.5683450.591162-0.104975-0.327292-0.334351-0.1127660.0500180.2946661.123322-0.306025-0.2413430.006553-0.02756764.0002020-09-24T16:31:59Z
4162523.0-2.2215561.2619872.0476424.659268-0.5359414.542044-3.715525-5.311701-0.9553210.200601-1.3426220.8799050.241171-0.365540-1.7354100.5644950.3806481.2166921.8727110.895990-1.8203880.873723-2.648598-0.162180-0.4921110.6014900.6270300.088289379.2902020-09-24T18:14:44Z
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL EVENT_TIMESTAMP\n", "0 28515.0 1.226643 0.101988 -0.087072 0.111524 -0.281992 -1.356027 0.469050 -0.371725 -0.153672 -0.145105 -0.143505 0.320964 0.149313 0.452515 0.776253 0.028739 -0.177867 -1.004746 0.264953 0.019692 -0.355100 -1.153663 0.109793 0.420318 0.197932 0.699218 -0.114861 0.007583 50.40 0 2020-09-23T05:01:16Z\n", "1 83125.0 1.124848 0.125602 0.249962 0.489744 -0.040386 0.167561 -0.247614 0.284736 -0.067302 -0.170139 1.895117 0.507733 -1.008648 0.187249 1.021663 0.136320 0.370186 -0.573559 -0.625413 -0.204465 -0.192467 -0.576819 0.190343 -0.357451 0.000870 0.139971 -0.000993 0.011505 1.98 0 2020-09-23T20:11:26Z\n", "2 75537.0 -0.307902 1.003715 1.404277 0.592627 0.311014 -0.382106 0.531393 -0.015292 -0.758638 -0.511597 0.149643 0.245866 0.752802 -0.382214 1.590185 -0.332546 0.611852 -0.510495 0.563779 0.125220 -0.131802 -0.329268 0.046990 0.057413 -0.656960 0.193192 0.142038 0.157501 1.98 0 2020-09-23T18:04:58Z\n", "3 156358.0 2.174919 -1.535441 -0.726428 -1.430792 -1.517258 -0.751038 -1.155344 -0.180811 -1.111885 1.536101 -0.735705 -0.771714 0.238603 -0.447050 -0.085886 -0.568345 0.591162 -0.104975 -0.327292 -0.334351 -0.112766 0.050018 0.294666 1.123322 -0.306025 -0.241343 0.006553 -0.027567 64.00 0 2020-09-24T16:31:59Z\n", "4 162523.0 -2.221556 1.261987 2.047642 4.659268 -0.535941 4.542044 -3.715525 -5.311701 -0.955321 0.200601 -1.342622 0.879905 0.241171 -0.365540 -1.735410 0.564495 0.380648 1.216692 1.872711 0.895990 -1.820388 0.873723 -2.648598 -0.162180 -0.492111 0.601490 0.627030 0.088289 379.29 0 2020-09-24T18:14:44Z" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# translate seconds delta to actual datetimes in ISO 8601\n", "def to_datetime(x):\n", " current_ep = start_ep + x\n", " current_dt = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.localtime(current_ep))\n", " 
return current_dt\n", "\n", "\n", "data['EVENT_TIMESTAMP'] = data['time'].apply(to_datetime)\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamount
041505.0-16.5265078.584972-18.6498539.505594-13.793819-2.832404-16.7016947.517344-8.507059-14.1101845.299236-10.8340061.671120-9.3738590.360806-9.899247-19.236292-8.3985523.101735-1.5149231.190739-1.127670-2.3585790.673461-1.413700-0.462762-2.018575-1.042804364.19
144261.00.339812-2.743745-0.134070-1.385729-1.4514131.015887-0.5243790.2240600.899746-0.565012-0.0876700.9794270.076883-0.217884-0.136830-2.1428920.1269561.7526620.4325460.506044-0.213436-0.942525-0.526819-1.1569920.311211-0.7466470.0409960.102038520.12
235484.01.399590-0.5907010.168619-1.029950-0.5398060.040444-0.7125670.002299-0.9717470.7568010.5438270.1124531.075384-0.2457720.1804831.769860-0.533172-0.5333001.1922450.2128770.1023980.168269-0.166639-0.8102500.505083-0.2323400.0114090.00463431.00
3167123.0-0.4320711.647895-1.669361-0.3495040.785785-0.6306470.2769900.586025-0.484715-1.376648-1.3283350.2236211.132627-0.5508750.6165680.4979740.5021950.9813430.101264-0.2446330.3589320.873663-0.178642-0.017171-0.207392-0.157756-0.2373860.0019341.50
4168473.02.014160-0.137394-1.0158390.327269-0.182179-0.9565710.043241-0.1607460.3632410.2594520.9421620.850038-0.6161660.592634-0.6038450.091077-0.471867-0.3338160.404711-0.255293-0.238644-0.6164000.3470450.061561-0.3601960.174730-0.078043-0.0705710.89
567878.0-0.641330-0.0573041.489998-1.688131-1.1510430.259996-1.391069-2.3340751.168644-2.0840800.4803810.473738-2.1922760.7739420.2944840.406074-0.5418551.0314500.0170760.618411-1.2316340.257164-0.371953-0.0385661.397514-0.6659470.0310030.180357100.00
6159763.02.023952-0.120140-1.0869180.423019-0.142901-1.1277520.178493-0.3032340.5645090.062831-0.7200470.366835-0.1108570.3190940.108359-0.153633-0.221312-0.9341410.070553-0.210864-0.276175-0.6977080.335631-0.017196-0.3249040.200023-0.071566-0.05822416.99
7139631.0-0.6889441.292153-0.564281-1.4575262.258333-0.3232701.678984-0.104128-1.285351-1.3034350.282728-0.402525-0.548687-0.504283-0.6853390.714828-0.0926740.798953-0.150085-0.037150-0.006880-0.171568-0.720019-0.4194351.2119910.670916-0.1039860.0300848.95
8133944.02.1193620.142639-2.3733370.5419490.608419-1.7755640.955775-0.5993830.0104200.295305-0.936569-0.452478-1.3407981.077459-0.099584-0.8150720.018481-0.639446-0.065427-0.3235730.2642640.898266-0.1680630.0593110.6269490.729035-0.129120-0.09471310.00
958769.0-5.584256-4.732413-0.448452-0.121442-0.707412-0.114376-1.5546281.402126-0.031693-0.942358-2.439501-0.552312-0.295588-0.250246-1.1977321.5495530.933237-1.2376890.416832-1.0469000.0416510.6217890.223467-0.7701370.621182-0.0287380.505194-1.898323101.49
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount\n", "0 41505.0 -16.526507 8.584972 -18.649853 9.505594 -13.793819 -2.832404 -16.701694 7.517344 -8.507059 -14.110184 5.299236 -10.834006 1.671120 -9.373859 0.360806 -9.899247 -19.236292 -8.398552 3.101735 -1.514923 1.190739 -1.127670 -2.358579 0.673461 -1.413700 -0.462762 -2.018575 -1.042804 364.19\n", "1 44261.0 0.339812 -2.743745 -0.134070 -1.385729 -1.451413 1.015887 -0.524379 0.224060 0.899746 -0.565012 -0.087670 0.979427 0.076883 -0.217884 -0.136830 -2.142892 0.126956 1.752662 0.432546 0.506044 -0.213436 -0.942525 -0.526819 -1.156992 0.311211 -0.746647 0.040996 0.102038 520.12\n", "2 35484.0 1.399590 -0.590701 0.168619 -1.029950 -0.539806 0.040444 -0.712567 0.002299 -0.971747 0.756801 0.543827 0.112453 1.075384 -0.245772 0.180483 1.769860 -0.533172 -0.533300 1.192245 0.212877 0.102398 0.168269 -0.166639 -0.810250 0.505083 -0.232340 0.011409 0.004634 31.00\n", "3 167123.0 -0.432071 1.647895 -1.669361 -0.349504 0.785785 -0.630647 0.276990 0.586025 -0.484715 -1.376648 -1.328335 0.223621 1.132627 -0.550875 0.616568 0.497974 0.502195 0.981343 0.101264 -0.244633 0.358932 0.873663 -0.178642 -0.017171 -0.207392 -0.157756 -0.237386 0.001934 1.50\n", "4 168473.0 2.014160 -0.137394 -1.015839 0.327269 -0.182179 -0.956571 0.043241 -0.160746 0.363241 0.259452 0.942162 0.850038 -0.616166 0.592634 -0.603845 0.091077 -0.471867 -0.333816 0.404711 -0.255293 -0.238644 -0.616400 0.347045 0.061561 -0.360196 0.174730 -0.078043 -0.070571 0.89\n", "5 67878.0 -0.641330 -0.057304 1.489998 -1.688131 -1.151043 0.259996 -1.391069 -2.334075 1.168644 -2.084080 0.480381 0.473738 -2.192276 0.773942 0.294484 0.406074 -0.541855 1.031450 0.017076 0.618411 -1.231634 0.257164 -0.371953 -0.038566 1.397514 -0.665947 0.031003 0.180357 100.00\n", "6 159763.0 2.023952 -0.120140 -1.086918 0.423019 -0.142901 -1.127752 0.178493 -0.303234 0.564509 0.062831 -0.720047 0.366835 -0.110857 
0.319094 0.108359 -0.153633 -0.221312 -0.934141 0.070553 -0.210864 -0.276175 -0.697708 0.335631 -0.017196 -0.324904 0.200023 -0.071566 -0.058224 16.99\n", "7 139631.0 -0.688944 1.292153 -0.564281 -1.457526 2.258333 -0.323270 1.678984 -0.104128 -1.285351 -1.303435 0.282728 -0.402525 -0.548687 -0.504283 -0.685339 0.714828 -0.092674 0.798953 -0.150085 -0.037150 -0.006880 -0.171568 -0.720019 -0.419435 1.211991 0.670916 -0.103986 0.030084 8.95\n", "8 133944.0 2.119362 0.142639 -2.373337 0.541949 0.608419 -1.775564 0.955775 -0.599383 0.010420 0.295305 -0.936569 -0.452478 -1.340798 1.077459 -0.099584 -0.815072 0.018481 -0.639446 -0.065427 -0.323573 0.264264 0.898266 -0.168063 0.059311 0.626949 0.729035 -0.129120 -0.094713 10.00\n", "9 58769.0 -5.584256 -4.732413 -0.448452 -0.121442 -0.707412 -0.114376 -1.554628 1.402126 -0.031693 -0.942358 -2.439501 -0.552312 -0.295588 -0.250246 -1.197732 1.549553 0.933237 -1.237689 0.416832 -1.046900 0.041651 0.621789 0.223467 -0.770137 0.621182 -0.028738 0.505194 -1.898323 101.49" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test = pd.DataFrame(X_test, columns = saved_cols)\n", "test.head(10)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountEVENT_LABEL
041505.0-16.5265078.584972-18.6498539.505594-13.793819-2.832404-16.7016947.517344-8.507059-14.1101845.299236-10.8340061.671120-9.3738590.360806-9.899247-19.236292-8.3985523.101735-1.5149231.190739-1.127670-2.3585790.673461-1.413700-0.462762-2.018575-1.042804364.191
144261.00.339812-2.743745-0.134070-1.385729-1.4514131.015887-0.5243790.2240600.899746-0.565012-0.0876700.9794270.076883-0.217884-0.136830-2.1428920.1269561.7526620.4325460.506044-0.213436-0.942525-0.526819-1.1569920.311211-0.7466470.0409960.102038520.120
235484.01.399590-0.5907010.168619-1.029950-0.5398060.040444-0.7125670.002299-0.9717470.7568010.5438270.1124531.075384-0.2457720.1804831.769860-0.533172-0.5333001.1922450.2128770.1023980.168269-0.166639-0.8102500.505083-0.2323400.0114090.00463431.000
3167123.0-0.4320711.647895-1.669361-0.3495040.785785-0.6306470.2769900.586025-0.484715-1.376648-1.3283350.2236211.132627-0.5508750.6165680.4979740.5021950.9813430.101264-0.2446330.3589320.873663-0.178642-0.017171-0.207392-0.157756-0.2373860.0019341.500
4168473.02.014160-0.137394-1.0158390.327269-0.182179-0.9565710.043241-0.1607460.3632410.2594520.9421620.850038-0.6161660.592634-0.6038450.091077-0.471867-0.3338160.404711-0.255293-0.238644-0.6164000.3470450.061561-0.3601960.174730-0.078043-0.0705710.890
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount EVENT_LABEL\n", "0 41505.0 -16.526507 8.584972 -18.649853 9.505594 -13.793819 -2.832404 -16.701694 7.517344 -8.507059 -14.110184 5.299236 -10.834006 1.671120 -9.373859 0.360806 -9.899247 -19.236292 -8.398552 3.101735 -1.514923 1.190739 -1.127670 -2.358579 0.673461 -1.413700 -0.462762 -2.018575 -1.042804 364.19 1\n", "1 44261.0 0.339812 -2.743745 -0.134070 -1.385729 -1.451413 1.015887 -0.524379 0.224060 0.899746 -0.565012 -0.087670 0.979427 0.076883 -0.217884 -0.136830 -2.142892 0.126956 1.752662 0.432546 0.506044 -0.213436 -0.942525 -0.526819 -1.156992 0.311211 -0.746647 0.040996 0.102038 520.12 0\n", "2 35484.0 1.399590 -0.590701 0.168619 -1.029950 -0.539806 0.040444 -0.712567 0.002299 -0.971747 0.756801 0.543827 0.112453 1.075384 -0.245772 0.180483 1.769860 -0.533172 -0.533300 1.192245 0.212877 0.102398 0.168269 -0.166639 -0.810250 0.505083 -0.232340 0.011409 0.004634 31.00 0\n", "3 167123.0 -0.432071 1.647895 -1.669361 -0.349504 0.785785 -0.630647 0.276990 0.586025 -0.484715 -1.376648 -1.328335 0.223621 1.132627 -0.550875 0.616568 0.497974 0.502195 0.981343 0.101264 -0.244633 0.358932 0.873663 -0.178642 -0.017171 -0.207392 -0.157756 -0.237386 0.001934 1.50 0\n", "4 168473.0 2.014160 -0.137394 -1.015839 0.327269 -0.182179 -0.956571 0.043241 -0.160746 0.363241 0.259452 0.942162 0.850038 -0.616166 0.592634 -0.603845 0.091077 -0.471867 -0.333816 0.404711 -0.255293 -0.238644 -0.616400 0.347045 0.061561 -0.360196 0.174730 -0.078043 -0.070571 0.89 0" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The testing dataset with the labels to perform evaluations latter on\n", "test_label = pd.DataFrame(X_test, columns = saved_cols)\n", "test_label['EVENT_LABEL']=y_test.astype(int)\n", "test_label.head()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "Number of frauds in test data: 46\n", "Number of non-frauds in test data: 28435\n", "Percentage of fradulent data: 0.16151118289385907\n" ] } ], "source": [ "#validating the test dataset with labels\n", "nonfrauds, frauds = test_label.groupby('EVENT_LABEL').size()\n", "print('Number of frauds in test data: ', frauds)\n", "print('Number of non-frauds in test data: ', nonfrauds)\n", "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of frauds: 446\n", "Number of non-frauds: 255880\n", "Percentage of fradulent data: 0.17399717547186005\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEFCAYAAAAWrxseAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAUZUlEQVR4nO3df9ClZX3f8fen/FKLCshCcIEs1c1UtHXFLZAxzdgwAwttZ3FGUkgLW4bOphZaHTONxJhCUTrSadRhQuiQsmWxKlKEsjbgZoumxpZfC/JDQgjPEIXNIiwuENAggt/+ca6nHg7nen7tcp6Ffb9m7jnnfO/ruu7rPCzP55zrvs9zUlVIkjTO31jsCUiSdl2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISE1SZYkeSDJ6xZ7Lq8lSfZJ8mdJDlrsuWj+DAlNVJJfS7I5ybNJHk1yY5JfmsBxK8nbZ2l2LvBfq+q51uePkzzX5jq9fTXJ0iQvJHnbmONcl+Q/DR3zhyP9f7PtO7/tP2Wo756ttqz9XKb7/CTJ80OP//MMz/P9SbbM8rOYPvbRI/V/nuTFoeM8lORDQ/uXtX7Pjmz/pO2/IsmnRo9XVT8G1gEfm2le2jXtudgT0O4jyUcZ/CL+l8BG4HlgFbAa+NYiTo0k+wBrgBUju86pqv8ypv1NwOnA+UO1A4CTgJVDTd9dVVOdw24HLkhybVW9OLyjqk4cGvcKYEtVfWLOT6gjSdq8tzN4vreNNLm5qn6ptT0K+N9Jbqmqbw+12a+qXpjnob8I3JXk4y009CrhOwlNRJI3AxcAZ1fVtVX1w6r6SVV9tar+bWuzT5LPJdnats+1X97Tr3K/NTLm/3930F7FXpLkD5M8k+TW6Vf6Sb7Zutw9/Mp3xDHAU1U146vwIesZ/LIddipwX1XdO8cxvsYgKP/ZHNvvDH8feCvwYeDUJHv3GlbVncD9wDt29KDt5/okcOyOjqXJMiQ0Kb8IvA64boY2v83gl8gK4N3A0cB8Xj2fBvx7YH9gCrgQoKp+ue1/d1XtW1VfHtP37wAPzONY1wEHjiyVnQ5cOY8xCvgd4Lwke82j345YA3wVmP4Z/KNewyR/D/gFYPNOOvb9DP676lXEkNCkvAV4YpZlin8KXFBVj1fVNga/8Edfrc/k2qq6rR3jC7x86Wgm+wHPjKlfnOSpoe2T
AFX118B/B84ASLIceC+DZZVhd470P2F4Z1VtALYB/2Iec12QJG8ATgG+WFU/Aa5hEBrDjm3zfJbBUtTngQdH2jwx8pzm+k7jGQY/Z72KGBKalB8weOU903mwtwLfG3r8vVabq+8P3f8RsO88+j4JvHFM/d9U1X5D2+8M7VsP/Gq7Gup04GtV9fhI/6NG+m8cc4xPMHgX9UpfVfUB4AXghvb4C8CJSZYMtbmlzXNf4OeAdwL/YWScA0ee0/1zPP4bgad2YP5aBIaEJuVm4Dng5BnabAV+fujx4a0G8EPgDdM7kvzcTp7fPQyWVuasqv6EQfitZnBeYT5LTcPjbGKwPPavFtJ/HtYwCM6Hk3yfwTuhvRgs042b12PAV4B/vJOO/w7g7p00libEkNBEVNXTwL8DLklycpI3JNkryYlJ/mNr9iXgE+3zCge29v+t7bsbeGeSFe2V+/nznMJjwN+aYf9twH5Jls5z3CuBixgso3x1nn2H/TbwmzvQ/yWSvG5kWwocx+AcxAp+dt7nIl6+5DQ9xlsYvPu4bx6H3mPkuHu3sZYCBwC3LPxZaTEYEpqYqvoM8FEGyyvbgEeAc4D/0Zp8isFJ0nuAe4E7W42q+nMGV0f9LwZr5PO9ZPZ8YH1bQ//VMXN7HriCl19p9Hsjnwm4Y2T/lQze8Xy5c2nn3SP9PzduclX1f3j55agLtRT465HtTOCuqvqjqvr+9AZcDPzdJO9qfX9xeq4MTjRvA/71yPhPjTynjw7tO3fkuF9v9V8D1nv566tP/GY6aaCtzf8J8J52Ylo7QbuM+W7gl8ecs9EuzpCQJHW53CS9yiT5+Jg/jfFskhsXe2567fGdhCSpy3cSkqSu19wf+DvwwANr2bJliz0NSXpVueOOO56oqiWj9ddcSCxbtozNm3fWn5qRpN1Dku+Nq7vcJEnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXa+7DdK8Wy879w8WewmvKdz/9Dxd7CtJr0qzvJJIcluQbSe5Pcl+SD7f6+Un+MsldbTtpqM9vJZlK8sDwF78nWdVqU0nOHaofkeTWJA8m+fLQt1nt0x5Ptf3LduaTlyTNbC7LTS8Av1FV7wCOBc5OcmTb99mqWtG2GwDavlMZfIH6KuD3k+yRZA/gEuBE4EjgtKFxLmpjLWfwhfRntfpZwJNV9Xbgs62dJGlCZg2Jqnq0qu5s959h8JWGM30P8Grgqqr6cVX9BYMveD+6bVNV9VD7qsirgNVJAvwKcE3rvx44eWis9e3+NcBxrb0kaQLmdeK6Lfe8B7i1lc5Jck+SdUn2b7WlDL67eNqWVuvV3wI8VVUvjNRfMlbb/3RrL0magDmHRJJ9ga8AH6mqvwIuBd4GrAAeBX53uumY7rWA+kxjjc5tbZLNSTZv27ZtxuchSZq7OYVEkr0YBMQXqupagKp6rKperKqfAn/AYDkJBu8EDhvqfiiwdYb6E8B+SfYcqb9krLb/zcD20flV1WVVtbKqVi5Z8rI/hy5JWqC5XN0U4HLg/qr6zFD9kKFmHwC+0+5vAE5tVyYdASwHbgNuB5a3K5n2ZnBye0MNvj/1G8AHW/81wPVDY61p9z8IfL38vlVJmpi5fE7ifcDpwL1J7mq1jzO4OmkFg+Wf7wK/DlBV9yW5GvhTBldGnV1VLwIkOQfYCOwBrKuq+9p4HwOuSvIp4NsMQol2+/kkUwzeQZy6A89VkjRPs4ZEVX2L8ecGbpihz4XAhWPqN4zrV1UP8bPlquH6c8Aps81RkvTK8M9ySJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktQ1a0gkOSzJN5Lc
n+S+JB9u9QOSbEryYLvdv9WT5OIkU0nuSXLU0FhrWvsHk6wZqr83yb2tz8VJMtMxJEmTMZd3Ei8Av1FV7wCOBc5OciRwLnBTVS0HbmqPAU4ElrdtLXApDH7hA+cBxwBHA+cN/dK/tLWd7req1XvHkCRNwKwhUVWPVtWd7f4zwP3AUmA1sL41Ww+c3O6vBq6sgVuA/ZIcApwAbKqq7VX1JLAJWNX2vamqbq6qAq4cGWvcMSRJEzCvcxJJlgHvAW4FDq6qR2EQJMBBrdlS4JGhbltabab6ljF1ZjiGJGkC5hwSSfYFvgJ8pKr+aqamY2q1gPqcJVmbZHOSzdu2bZtPV0nSDOYUEkn2YhAQX6iqa1v5sbZURLt9vNW3AIcNdT8U2DpL/dAx9ZmO8RJVdVlVrayqlUuWLJnLU5IkzcFcrm4KcDlwf1V9ZmjXBmD6CqU1wPVD9TPaVU7HAk+3paKNwPFJ9m8nrI8HNrZ9zyQ5th3rjJGxxh1DkjQBe86hzfuA04F7k9zVah8HPg1cneQs4GHglLbvBuAkYAr4EXAmQFVtT/JJ4PbW7oKq2t7ufwi4Ang9cGPbmOEYkqQJmDUkqupbjD9vAHDcmPYFnN0Zax2wbkx9M/CuMfUfjDuGJGky/MS1JKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqmjUkkqxL8niS7wzVzk/yl0nuattJQ/t+K8lUkgeSnDBUX9VqU0nOHaofkeTWJA8m+XKSvVt9n/Z4qu1ftrOetCRpbubyTuIKYNWY+merakXbbgBIciRwKvDO1uf3k+yRZA/gEuBE4EjgtNYW4KI21nLgSeCsVj8LeLKq3g58trWTJE3QrCFRVd8Ets9xvNXAVVX146r6C2AKOLptU1X1UFU9D1wFrE4S4FeAa1r/9cDJQ2Otb/evAY5r7SVJE7Ij5yTOSXJPW47av9WWAo8MtdnSar36W4CnquqFkfpLxmr7n27tJUkTstCQuBR4G7ACeBT43VYf90q/FlCfaayXSbI2yeYkm7dt2zbTvCVJ87CgkKiqx6rqxar6KfAHDJaTYPBO4LChpocCW2eoPwHsl2TPkfpLxmr730xn2auqLquqlVW1csmSJQt5SpKkMRYUEkkOGXr4AWD6yqcNwKntyqQjgOXAbcDtwPJ2JdPeDE5ub6iqAr4BfLD1XwNcPzTWmnb/g8DXW3tJ0oTsOVuDJF8C3g8cmGQLcB7w/iQrGCz/fBf4dYCqui/J1cCfAi8AZ1fVi22cc4CNwB7Auqq6rx3iY8BVST4FfBu4vNUvBz6fZIrBO4hTd/jZSpLmZdaQqKrTxpQvH1Obbn8hcOGY+g3ADWPqD/Gz5arh+nPAKbPNT5L0yvET15KkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1zRoSSdYleTzJd4ZqByTZlOTBdrt/qyfJxUmmktyT5KihPmta+weTrBmqvzfJva3PxUky0zEkSZMzl3cSVwCrRmrnAjdV1XLgpvYY4ERgedvWApfC4Bc+cB5wDHA0cN7QL/1LW9vpfqtmOYYkaUJmDYmq+iawfaS8Gljf7q8HTh6qX1kDtwD7JTkEOAHYVFXbq+pJYBOwqu17U1XdXFUFXDky1rhjSJImZKHnJA6uqkcB2u1Brb4UeGSo3ZZWm6m+ZUx9pmO8TJK1STYn2bxt27YF
PiVJ0qidfeI6Y2q1gPq8VNVlVbWyqlYuWbJkvt0lSR0LDYnH2lIR7fbxVt8CHDbU7lBg6yz1Q8fUZzqGJGlCFhoSG4DpK5TWANcP1c9oVzkdCzzdloo2Ascn2b+dsD4e2Nj2PZPk2HZV0xkjY407hiRpQvacrUGSLwHvBw5MsoXBVUqfBq5OchbwMHBKa34DcBIwBfwIOBOgqrYn+SRwe2t3QVVNnwz/EIMrqF4P3Ng2ZjiGJGlCZg2Jqjqts+u4MW0LOLszzjpg3Zj6ZuBdY+o/GHcMSdLk+IlrSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUtUMhkeS7Se5NcleSza12QJJNSR5st/u3epJcnGQqyT1JjhoaZ01r/2CSNUP197bxp1rf7Mh8JUnzszPeSfyDqlpRVSvb43OBm6pqOXBTewxwIrC8bWuBS2EQKsB5wDHA0cB508HS2qwd6rdqJ8xXkjRHr8Ry02pgfbu/Hjh5qH5lDdwC7JfkEOAEYFNVba+qJ4FNwKq2701VdXNVFXDl0FiSpAnY0ZAo4I+S3JFkbasdXFWPArTbg1p9KfDIUN8trTZTfcuYuiRpQvbcwf7vq6qtSQ4CNiX5sxnajjufUAuov3zgQUCtBTj88MNnnrEkac526J1EVW1tt48D1zE4p/BYWyqi3T7emm8BDhvqfiiwdZb6oWPq4+ZxWVWtrKqVS5Ys2ZGnJEkasuCQSPI3k7xx+j5wPPAdYAMwfYXSGuD6dn8DcEa7yulY4Om2HLUROD7J/u2E9fHAxrbvmSTHtquazhgaS5I0ATuy3HQwcF27KnVP4ItV9bUktwNXJzkLeBg4pbW/ATgJmAJ+BJwJUFXbk3wSuL21u6Cqtrf7HwKuAF4P3Ng2SdKELDgkquoh4N1j6j8AjhtTL+DszljrgHVj6puBdy10jpKkHeMnriVJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnq2uVDIsmqJA8kmUpy7mLPR5J2J7t0SCTZA7gEOBE4EjgtyZGLOytJ2n3s0iEBHA1MVdVDVfU8cBWwepHnJEm7jT0XewKzWAo8MvR4C3DMaKMka4G17eGzSR6YwNx2FwcCTyz2JGaTixZ7BloEr4p/m68iPz+uuKuHRMbU6mWFqsuAy1756ex+kmyuqpWLPQ9plP82J2NXX27aAhw29PhQYOsizUWSdju7ekjcDixPckSSvYFTgQ2LPCdJ2m3s0stNVfVCknOAjcAewLqqum+Rp7W7cRlPuyr/bU5Aql62xC9JErDrLzdJkhaRISFJ6jIkJEldu/SJa01Wkr/N4BPtSxl8HmUrsKGq7l/UiUlaNL6TEABJPsbgz54EuI3B5ccBvuQfVtSuLMmZiz2H1zKvbhIASf4ceGdV/WSkvjdwX1UtX5yZSTNL8nBVHb7Y83itcrlJ034KvBX43kj9kLZPWjRJ7untAg6e5Fx2N4aEpn0EuCnJg/zsjyoeDrwdOGfRZiUNHAycADw5Ug/wfyc/nd2HISEAquprSX6BwZ9nX8rgf74twO1V9eKiTk6C/wnsW1V3je5I8seTn87uw3MSkqQur26SJHUZEpKkLkNCktRlSEiSugwJSVLX/wPySywSeEQacQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#validating the training dataset\n", "nonfrauds, frauds = data.groupby('EVENT_LABEL').size()\n", "print('Number of frauds: ', frauds)\n", "print('Number of non-frauds: ', nonfrauds)\n", "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))\n", "\n", "count_class_0, count_class_1 = data.EVENT_LABEL.value_counts()\n", "data.EVENT_LABEL.value_counts().plot(kind='bar', title='Count (EVENT_LABEL)');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Uploading the data for training" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': 'AX5P2P5W6K2Y7V2W',\n", " 'HostId': 'dAdIFZFsxxw5ufQqVlMhLaO1zn5bPeuI2w7ua3cqHKYcg8DcZ27gyL3OMuFDaOifAfwszauk39g=',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amz-id-2': 'dAdIFZFsxxw5ufQqVlMhLaO1zn5bPeuI2w7ua3cqHKYcg8DcZ27gyL3OMuFDaOifAfwszauk39g=',\n", " 'x-amz-request-id': 'AX5P2P5W6K2Y7V2W',\n", " 'date': 'Tue, 22 Sep 2020 21:06:38 GMT',\n", " 'x-amz-server-side-encryption': 'AES256',\n", " 'etag': '\"8baed18fc24e97ddcba8e6ff15de1f93\"',\n", " 'content-length': '0',\n", " 'server': 'AmazonS3'},\n", " 'RetryAttempts': 0},\n", " 'ETag': '\"8baed18fc24e97ddcba8e6ff15de1f93\"',\n", " 'ServerSideEncryption': 'AES256'}" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csv_buffer = StringIO()\n", "data.to_csv(csv_buffer, index=False)\n", "s3_resource.Object(S3_BUCKET, 'dataset-training.csv').put(Body=csv_buffer.getvalue())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Uploading the data for testing" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': 'B6E8DEF11A9A2C22',\n", " 'HostId': 
'jqD7xx2/tpv2PnFnhAiHXMRTaF7JDGex1nnP2GYXLFK/O19KkDONmxejes76LkFQtZE9mnbGlZU=',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amz-id-2': 'jqD7xx2/tpv2PnFnhAiHXMRTaF7JDGex1nnP2GYXLFK/O19KkDONmxejes76LkFQtZE9mnbGlZU=',\n", " 'x-amz-request-id': 'B6E8DEF11A9A2C22',\n", " 'date': 'Tue, 22 Sep 2020 21:06:43 GMT',\n", " 'x-amz-server-side-encryption': 'AES256',\n", " 'etag': '\"4338feb9a18c65c7c0c98c68c5c935e6\"',\n", " 'content-length': '0',\n", " 'server': 'AmazonS3'},\n", " 'RetryAttempts': 0},\n", " 'ETag': '\"4338feb9a18c65c7c0c98c68c5c935e6\"',\n", " 'ServerSideEncryption': 'AES256'}" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csv_buffer = StringIO()\n", "test.to_csv(csv_buffer, index=False)\n", "\n", "s3_resource.Object(S3_BUCKET, 'dataset-test.csv').put(Body=csv_buffer.getvalue())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once we have the datasets ready, we need to create the necessary entities to build and deploy the fraud detection model. This can be done within the Amazon Fraud Detector console or through the API as shown in the second Jupyter notebook." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we are going to train and deploy the SageMaker version of the model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we copy the data to an in-memory buffer." ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "buf = io.BytesIO()\n", "\n", "sklearn.datasets.dump_svmlight_file(X, y, buf)\n", "buf.seek(0);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we upload the data to S3 using boto3." 
] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uploaded training data location: s3://afd-poc-trainingbucket-1i37svk9elcoe/sagemaker-dataset-training\n", "Training artifacts will be uploaded to: s3://afd-poc-outputbucket-1fhn20d7tumgg\n" ] } ], "source": [ "key = 'sagemaker-dataset-training'\n", "boto3.resource('s3').Bucket(S3_BUCKET).Object(key).upload_fileobj(buf)\n", "\n", "s3_train_data = 's3://{}/{}'.format(S3_BUCKET, key)\n", "print('Uploaded training data location: {}'.format(s3_train_data))\n", "\n", "output_location = 's3://{}'.format(S3_OUT_BUCKET)\n", "print('Training artifacts will be uploaded to: {}'.format(output_location))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can now train using SageMaker's built-in XGBoost algorithm. To specify the XGBoost algorithm, we use a utility function to obtain its URI. A complete list of built-in algorithms is found here: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.\n" ] } ], "source": [ "container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='1.0-1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "SageMaker abstracts training via Estimators. We can pass the classifier and parameters along with hyperparameters to the estimator, and fit the estimator to the data in S3. An important parameter here is scale_pos_weight which scales the weights of the positive vs. negative class examples. This is crucial to do in an imbalanced dataset like the one we are using here, otherwise the majority class would dominate the learning." 
] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.\n", "'s3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2020-09-22 21:27:11 Starting - Starting the training job...\n", "2020-09-22 21:27:14 Starting - Launching requested ML instances......\n", "2020-09-22 21:28:42 Starting - Preparing the instances for training............\n", "2020-09-22 21:30:25 Downloading - Downloading input data...\n", "2020-09-22 21:30:46 Training - Downloading the training image..\u001b[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", "\u001b[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", "\u001b[34m[21:31:19] 256326x30 matrix with 7688153 entries loaded from /opt/ml/input/data/train\u001b[0m\n", "\u001b[34mINFO:root:Single node training.\u001b[0m\n", "\u001b[34mINFO:root:Train matrix has 256326 rows\u001b[0m\n", "\u001b[34m[0]#011train-auc:0.94930\u001b[0m\n", "\u001b[34m[1]#011train-auc:0.97015\u001b[0m\n", "\u001b[34m[2]#011train-auc:0.97634\u001b[0m\n", "\u001b[34m[3]#011train-auc:0.97924\u001b[0m\n", "\u001b[34m[4]#011train-auc:0.97912\u001b[0m\n", "\u001b[34m[5]#011train-auc:0.98223\u001b[0m\n", "\u001b[34m[6]#011train-auc:0.98227\u001b[0m\n", "\u001b[34m[7]#011train-auc:0.98236\u001b[0m\n", 
"\u001b[34m[8]#011train-auc:0.98251\u001b[0m\n", "\u001b[34m[9]#011train-auc:0.98409\u001b[0m\n", "\u001b[34m[10]#011train-auc:0.98539\u001b[0m\n", "\u001b[34m[11]#011train-auc:0.98637\u001b[0m\n", "\u001b[34m[12]#011train-auc:0.99252\u001b[0m\n", "\u001b[34m[13]#011train-auc:0.99264\u001b[0m\n", "\u001b[34m[14]#011train-auc:0.99328\u001b[0m\n", "\u001b[34m[15]#011train-auc:0.99423\u001b[0m\n", "\u001b[34m[16]#011train-auc:0.99657\u001b[0m\n", "\u001b[34m[17]#011train-auc:0.99722\u001b[0m\n", "\u001b[34m[18]#011train-auc:0.99788\u001b[0m\n", "\n", "2020-09-22 21:31:16 Training - Training image download completed. Training in progress.\u001b[34m[19]#011train-auc:0.99883\u001b[0m\n", "\u001b[34m[20]#011train-auc:0.99899\u001b[0m\n", "\u001b[34m[21]#011train-auc:0.99914\u001b[0m\n", "\u001b[34m[22]#011train-auc:0.99910\u001b[0m\n", "\u001b[34m[23]#011train-auc:0.99972\u001b[0m\n", "\u001b[34m[24]#011train-auc:0.99973\u001b[0m\n", "\u001b[34m[25]#011train-auc:0.99984\u001b[0m\n", "\u001b[34m[26]#011train-auc:0.99989\u001b[0m\n", "\u001b[34m[27]#011train-auc:0.99989\u001b[0m\n", "\u001b[34m[28]#011train-auc:0.99990\u001b[0m\n", "\u001b[34m[29]#011train-auc:0.99991\u001b[0m\n", "\u001b[34m[30]#011train-auc:0.99993\u001b[0m\n", "\u001b[34m[31]#011train-auc:0.99994\u001b[0m\n", "\u001b[34m[32]#011train-auc:0.99995\u001b[0m\n", "\u001b[34m[33]#011train-auc:0.99995\u001b[0m\n", "\u001b[34m[34]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[35]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[36]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[37]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[38]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[39]#011train-auc:0.99996\u001b[0m\n", "\u001b[34m[40]#011train-auc:0.99997\u001b[0m\n", "\u001b[34m[41]#011train-auc:0.99997\u001b[0m\n", "\u001b[34m[42]#011train-auc:0.99997\u001b[0m\n", "\u001b[34m[43]#011train-auc:0.99997\u001b[0m\n", "\u001b[34m[44]#011train-auc:0.99997\u001b[0m\n", "\u001b[34m[45]#011train-auc:0.99998\u001b[0m\n", 
"\u001b[34m[46]#011train-auc:0.99998\u001b[0m\n", "\u001b[34m[47]#011train-auc:0.99998\u001b[0m\n", "\u001b[34m[48]#011train-auc:0.99998\u001b[0m\n", "\u001b[34m[49]#011train-auc:0.99998\u001b[0m\n", "\u001b[34m[50]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[51]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[52]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[53]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[54]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[55]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[56]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[57]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[58]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[59]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[60]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[61]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[62]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[63]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[64]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[65]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[66]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[67]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[68]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[69]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[70]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[71]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[72]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[73]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[74]#011train-auc:0.99999\u001b[0m\n", "\u001b[34m[75]#011train-auc:0.99999\u001b[0m\n", "\n", "2020-09-22 21:32:14 Uploading - Uploading generated training model\u001b[34m[76]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[77]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[78]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[79]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[80]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[81]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[82]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[83]#011train-auc:1.00000\u001b[0m\n", 
"\u001b[34m[84]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[85]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[86]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[87]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[88]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[89]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[90]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[91]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[92]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[93]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[94]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[95]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[96]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[97]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[98]#011train-auc:1.00000\u001b[0m\n", "\u001b[34m[99]#011train-auc:1.00000\u001b[0m\n", "\n", "2020-09-22 21:32:23 Completed - Training job completed\n", "Training seconds: 118\n", "Billable seconds: 118\n" ] } ], "source": [ "# Because the data set is so highly skewed, we set the scale position weight conservatively,\n", "# as sqrt(num_nonfraud/num_fraud).\n", "# Other recommendations for the scale_pos_weight are setting it to (num_nonfraud/num_fraud).\n", "session = sagemaker.Session()\n", "\n", "scale_pos_weight = sqrt(np.count_nonzero(y==0)/np.count_nonzero(y))\n", "hyperparams = {\n", " \"max_depth\":5,\n", " \"subsample\":0.8,\n", " \"num_round\":100,\n", " \"eta\":0.2,\n", " \"gamma\":4,\n", " \"min_child_weight\":6,\n", " \"silent\":0,\n", " \"objective\":'binary:logistic',\n", " \"eval_metric\":'auc',\n", " \"scale_pos_weight\": scale_pos_weight\n", "}\n", "\n", "clf = sagemaker.estimator.Estimator(container,\n", " get_execution_role(),\n", " hyperparameters=hyperparams,\n", " train_instance_count=1, \n", " train_instance_type='ml.m4.xlarge',\n", " output_path=output_location,\n", " sagemaker_session=session)\n", "clf.fit({'train': s3_train_data})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we deploy the estimator to and endpoint." 
] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Parameter image will be renamed to image_uri in SageMaker Python SDK v2.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "---------------!" ] } ], "source": [ "predictor = clf.deploy(initial_instance_count=1,\n", " endpoint_name=\"fraud-detection-endpoint\",\n", " instance_type='ml.m4.xlarge', \n", " serializer=csv_serializer,\n", " deserializer=None,\n", " content_type='text/csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once we have trained the model we can use it to make predictions for the test set." ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "# Because we have a large test set, we call predict on smaller batches\n", "def predict(current_predictor, data, rows=500):\n", " split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n", " predictions = ''\n", " for array in split_array:\n", " predictions = ','.join([predictions, current_predictor.predict(array).decode('utf-8')])\n", "\n", " return np.fromstring(predictions[1:], sep=',')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "raw_preds = predict(predictor, X_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use a few measures from the scikit-learn package to evaluate the performance of our model. When dealing with an imbalanced dataset, we need to choose metrics that take into account the frequency of each class in the data.\n", "Two such metrics are the balanced accuracy score, and Cohen's Kappa." 
] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Balanced accuracy = 0.9021211611532022\n" ] } ], "source": [ "# SageMaker model balanced accuracy for 0.5 threshold\n", "y_preds = np.where(raw_preds > 0.5, 1, 0)\n", "print(\"Balanced accuracy = {}\".format(balanced_accuracy_score(y_test, y_preds)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can already see that our model performs very well in terms of both metrics; Cohen's Kappa scores above 0.8 are generally considered very favorable.\n", "Apart from single-value metrics, it's also useful to look at metrics that indicate performance per class. A confusion matrix, and per-class precision, recall and f1-score can also provide more information about the model's performance." ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [], "source": [ "def plot_confusion_matrix(y_true, y_predicted):\n", " cm = confusion_matrix(y_true, y_predicted)\n", " # Get the per-class normalized value for each cell\n", " cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", " # We color each cell according to its normalized value, annotate with exact counts.\n", " ax = sns.heatmap(cm_norm, annot=cm, fmt=\"d\")\n", " ax.set(xticklabels=[\"non-fraud\", \"fraud\"], yticklabels=[\"non-fraud\", \"fraud\"])\n", " ax.set_ylim([0,2])\n", " plt.title('Confusion Matrix')\n", " plt.ylabel('Real Classes')\n", " plt.xlabel('Predicted Classes')\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plot_confusion_matrix(y_test, y_preds)" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " non-fraud 1.00 1.00 1.00 28435\n", " fraud 0.93 0.80 0.86 46\n", "\n", " accuracy 1.00 28481\n", " macro avg 0.96 0.90 0.93 28481\n", "weighted avg 1.00 1.00 1.00 28481\n", "\n" ] } ], "source": [ "print(classification_report(\n", " y_test, y_preds, target_names=['non-fraud', 'fraud']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have the sagemaker model, lets build the Amazon Fraud Detector one" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "# -- This is all you need to fill out. Once complete simply interactively run each code cell. -- \n", "# your_entity_name\n", "ENTITY_TYPE = \"creditcardtrans{0}\".format(sufx) \n", "ENTITY_DESC = \"creditcard transactions: {0}\".format(sufx) \n", "# your_event_type\n", "EVENT_TYPE = \"creditcard{0}\".format(sufx) \n", "EVENT_DESC = \"creditcard card payment events: {0}\".format(sufx) \n", "# your_model_name\n", "MODEL_NAME = \"fraud_detector_model{0}\".format(sufx) \n", "MODEL_DESC = \"model trained on: {0}\".format(sufx) \n", "# your_detector_name\n", "DETECTOR_NAME = \"fraud_detector_endpoint{0}\".format(sufx) \n", "DETECTOR_DESC = \"detects synthetic fraud events created: {0}\".format(sufx) " ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--- summary stats ---\n", " feature_name dtype count nunique null not_null null_pct nunique_pct feature_type feature_warning\n", "0 time float64 256326 119735 0 256326 0.0 0.4671 NUMERIC NO WARNING\n", "1 va float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "2 vb float64 256326 248553 0 
256326 0.0 0.9697 NUMERIC NO WARNING\n", "3 vc float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "4 vd float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "5 ve float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "6 vf float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "7 vg float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "8 vh float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "9 vi float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "10 vj float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "11 vk float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "12 vl float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "13 vm float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "14 vn float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "15 vo float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "16 vp float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "17 vq float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "18 vr float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "19 vs float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "20 vt float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "21 vu float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "22 vv float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "23 vw float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "24 vx float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "25 vy float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "26 vz float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "27 vaa float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "28 vab float64 256326 248553 0 256326 0.0 0.9697 NUMERIC NO WARNING\n", "29 amount float64 256326 31154 0 256326 0.0 0.1215 PRICE NO WARNING\n", "30 EVENT_LABEL 
int64 256326 2 0 256326 0.0 0.0000 TARGET NO WARNING\n", "31 EVENT_TIMESTAMP object 256326 119735 0 256326 0.0 0.4671 EVENT_TIMESTAMP NO WARNING\n", "\n", "\n", "--- event variables ---\n", "['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab', 'amount']\n", "\n", "\n", "--- event labels ---\n", "[0, 1]\n", "\n", "\n", "--- training data schema ---\n", "{'modelVariables': ['time', 'va', 'vb', 'vc', 'vd', 've', 'vf', 'vg', 'vh', 'vi', 'vj', 'vk', 'vl', 'vm', 'vn', 'vo', 'vp', 'vq', 'vr', 'vs', 'vt', 'vu', 'vv', 'vw', 'vx', 'vy', 'vz', 'vaa', 'vab'], 'labelSchema': {'labelMapper': {'FRAUD': ['1'], 'LEGIT': ['0']}}}\n", "\n", "\n" ] } ], "source": [ "# Dataset profiling, just run this code block\n", "def summary_stats(df):\n", " \"\"\" Generate summary statistics for a panda's data frame \n", " Args:\n", " df (DataFrame): panda's dataframe to create summary statistics for.\n", " Returns:\n", " DataFrame of summary statistics, training data schema, event variables and event lables \n", " \"\"\"\n", " df = df.copy()\n", " rowcnt = len(df)\n", " df_s1 = df.agg(['count', 'nunique']).transpose().reset_index().rename(columns={\"index\":\"feature_name\"})\n", " df_s1[\"null\"] = (rowcnt - df_s1[\"count\"]).astype('int64')\n", " df_s1[\"not_null\"] = rowcnt - df_s1[\"null\"]\n", " df_s1[\"null_pct\"] = df_s1[\"null\"] / rowcnt\n", " df_s1[\"nunique_pct\"] = df_s1['nunique']/ rowcnt\n", " dt = pd.DataFrame(df.dtypes).reset_index().rename(columns={\"index\":\"feature_name\", 0:\"dtype\"})\n", " df_stats = pd.merge(dt, df_s1, on='feature_name', how='inner').round(4)\n", " df_stats['nunique'] = df_stats['nunique'].astype('int64')\n", " df_stats['count'] = df_stats['count'].astype('int64')\n", " \n", " # -- variable type mapper -- \n", " df_stats['feature_type'] = \"UNKOWN\"\n", " df_stats.loc[df_stats[\"dtype\"] == object, 'feature_type'] = \"CATEGORY\"\n", 
" df_stats.loc[(df_stats[\"dtype\"] == \"int64\") | (df_stats[\"dtype\"] == \"float64\"), 'feature_type'] = \"NUMERIC\"\n", " df_stats.loc[df_stats[\"feature_name\"].str.contains(\"ipaddress|ip_address|ipcli\"), 'feature_type'] = \"IP_ADDRESS\"\n", " df_stats.loc[df_stats[\"feature_name\"].str.contains(\"email|email_address|emailaddr\"), 'feature_type'] = \"EMAIL_ADDRESS\"\n", " df_stats.loc[df_stats[\"feature_name\"].str.contains(\"canal|channel\"), 'feature_type'] = \"USERAGENT\"\n", " df_stats.loc[df_stats[\"feature_name\"].str.contains(\"monto|amount\"), 'feature_type'] = \"PRICE\"\n", " df_stats.loc[df_stats[\"feature_name\"].str.contains(\"nomdes|name\"), 'feature_type'] = \"BILLING_NAME\"\n", " df_stats.loc[df_stats[\"feature_name\"] == \"EVENT_LABEL\", 'feature_type'] = \"TARGET\"\n", " df_stats.loc[df_stats[\"feature_name\"] == \"EVENT_TIMESTAMP\", 'feature_type'] = \"EVENT_TIMESTAMP\"\n", " \n", " # -- variable warnings -- \n", " df_stats['feature_warning'] = \"NO WARNING\"\n", " df_stats.loc[(df_stats[\"nunique\"] != 2) & (df_stats[\"feature_name\"] == \"EVENT_LABEL\"),'feature_warning' ] = \"LABEL WARNING, NON-BINARY EVENT LABEL\"\n", " df_stats.loc[(df_stats[\"nunique_pct\"] > 0.97) & (df_stats['feature_type'] == \"CATEGORY\") ,'feature_warning' ] = \"EXCLUDE, GT 97% UNIQUE\"\n", " df_stats.loc[(df_stats[\"null_pct\"] > 0.2) & (df_stats[\"null_pct\"] <= 0.5), 'feature_warning' ] = \"NULL WARNING, GT 20% MISSING\"\n", " df_stats.loc[df_stats[\"null_pct\"] > 0.5,'feature_warning' ] = \"EXCLUDE, GT 50% MISSING\"\n", " df_stats.loc[((df_stats['dtype'] == \"int64\" ) | (df_stats['dtype'] == \"float64\" ) ) & (df_stats['nunique'] < 0.2), 'feature_warning' ] = \"LIKELY CATEGORICAL, NUMERIC w. 
LOW CARDINALITY\"\n", " \n", " # -- target check -- \n", " exclude_fields = df_stats.loc[(df_stats['feature_warning'] != 'NO WARNING')]['feature_name'].to_list()\n", " event_variables = df_stats.loc[(~df_stats['feature_name'].isin(['EVENT_LABEL', 'EVENT_TIMESTAMP']))]['feature_name'].to_list()\n", " event_labels = df[\"EVENT_LABEL\"].unique().tolist()\n", " \n", " trainingDataSchema = {\n", " 'modelVariables' : df_stats.loc[(df_stats['feature_type'].isin(['IP_ADDRESS', 'EMAIL_ADDRESS', 'CATEGORY', 'NUMERIC' ]))]['feature_name'].to_list(),\n", " 'labelSchema' : {\n", " 'labelMapper' : {\n", " 'FRAUD' : [str(df[\"EVENT_LABEL\"].value_counts().idxmin())],\n", " 'LEGIT' : [str(df[\"EVENT_LABEL\"].value_counts().idxmax())]\n", " }\n", " }\n", " }\n", " \n", " \n", " model_variables = df_stats.loc[(df_stats['feature_type'].isin(['IP_ADDRESS', 'EMAIL_ADDRESS', 'CATEGORY', 'NUMERIC' ]))]['feature_name'].to_list()\n", " \n", " \n", " # -- label schema -- \n", " label_map = {\n", " 'FRAUD' : [df[\"EVENT_LABEL\"].value_counts().idxmin()],\n", " 'LEGIT' : [df[\"EVENT_LABEL\"].value_counts().idxmax()]\n", " }\n", " \n", " \n", " print(\"--- summary stats ---\")\n", " print(df_stats)\n", " print(\"\\n\")\n", " print(\"--- event variables ---\")\n", " print(event_variables)\n", " print(\"\\n\")\n", " print(\"--- event labels ---\")\n", " print(event_labels)\n", " print(\"\\n\")\n", " print(\"--- training data schema ---\")\n", " print(trainingDataSchema)\n", " print(\"\\n\")\n", " \n", " return df_stats, trainingDataSchema, event_variables, event_labels\n", "\n", "# -- connect to S3, snag file, and convert to a panda's dataframe --\n", "#s3 = boto3.resource('s3')\n", "#obj = s3.Object(S3_BUCKET, S3_FILE)\n", "#body = obj.get()['Body']\n", "#df = pd.read_csv(body)\n", "\n", "# -- call profiling function -- \n", "df_stats, trainingDataSchema, eventVariables, eventLabels = summary_stats(data)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Creating variable: time\n", "Creating variable: fraud_detector_model20200922_insightscore\n", "\n", " --- model variable dict --\n", "[{'name': 'amount'}, {'name': 'time'}, {'name': 'va'}, {'name': 'vb'}, {'name': 'vc'}, {'name': 'vd'}, {'name': 've'}, {'name': 'vf'}, {'name': 'vg'}, {'name': 'vh'}, {'name': 'vi'}, {'name': 'vj'}, {'name': 'vk'}, {'name': 'vl'}, {'name': 'vm'}, {'name': 'vn'}, {'name': 'vo'}, {'name': 'vp'}, {'name': 'vq'}, {'name': 'vr'}, {'name': 'vs'}, {'name': 'vt'}, {'name': 'vu'}, {'name': 'vv'}, {'name': 'vw'}, {'name': 'vx'}, {'name': 'vy'}, {'name': 'vz'}, {'name': 'vaa'}, {'name': 'vab'}]\n", "\n", " --- model label schema dict --\n", "{'labelKey': 'EVENT_LABEL', 'labelMapper': {'FRAUD': ['1'], 'LEGIT': ['0']}}\n" ] } ], "source": [ "# Variable creation just run this code block\n", "def create_label(df, FRAUD_LABEL):\n", " \"\"\"\n", " Returns a dictionary for the model labelSchema, by identifying the rare event as fraud / and common as not-fraud \n", " \n", " Arguments:\n", " df -- input dataframe \n", " FRAUD_LABEL -- the name of the field that contains fraud label \n", " \n", " Returns:\n", " labelSchema -- a dictionary containing labelKey & labelMapper \n", " \"\"\"\n", " label_summary = df[FRAUD_LABEL].value_counts()\n", " labelSchema = {'labelKey': FRAUD_LABEL,\n", " \"labelMapper\" : { \"FRAUD\": [str(label_summary.idxmin())], \n", " \"LEGIT\": [str(label_summary.idxmax())]}\n", " }\n", " afd_resource.put_label(\n", " name = str(label_summary.idxmin()),\n", " description = 'FRAUD')\n", " \n", " afd_resource.put_label(\n", " name = str(label_summary.idxmax()),\n", " description = 'LEGIT')\n", " \n", " return labelSchema\n", " \n", "# -- function to create all your variables --- \n", "def create_variables(df_stats, MODEL_NAME):\n", " \"\"\"\n", " Returns a variable list of model input variables, checks to see if variable exists,\n", " and, if not, then it adds the variable to Fraud 
Detector \n", " \n", " Arguments: \n", " enrichment_features -- dictionary of optional features, mapped to specific variable types enriched (CARD_BIN, USERAGENT)\n", " numeric_features -- optional list of numeric field names \n", " categorical_features -- optional list of categorical features \n", " \n", " Returns:\n", " variable_list -- a list of variable dictionaries \n", " \n", " \"\"\"\n", " enrichment_features = df_stats.loc[(df_stats['feature_type'].isin(['IP_ADDRESS', 'EMAIL_ADDRESS', 'USERAGENT', 'BILLING_NAME', 'PRICE']))]['feature_name'].to_dict()\n", " enrichment_type = df_stats.loc[(df_stats['feature_type'].isin(['IP_ADDRESS', 'EMAIL_ADDRESS', 'USERAGENT', 'BILLING_NAME', 'PRICE']))]['feature_type'].to_dict()\n", " numeric_features = df_stats.loc[(df_stats['feature_type'].isin(['NUMERIC']))]['feature_name'].to_dict()\n", " categorical_features = df_stats.loc[(df_stats['feature_type'].isin(['CATEGORY']))]['feature_name'].to_dict()\n", " \n", " variable_list = []\n", " # -- first do the enrichment features\n", " for feature in enrichment_features.keys(): \n", " variable_list.append( {'name' : enrichment_features[feature]+\"\"})\n", " try:\n", " varname = enrichment_features[feature]+\"\"\n", " afd_resource.get_variables(name=varname)\n", " except:\n", " print(\"Creating variable: {0}\".format(enrichment_features[feature]))\n", " if enrichment_type[feature] == \"PRICE\":\n", " resp = afd_resource.create_variable(\n", " name = varname,\n", " dataType = 'FLOAT',\n", " dataSource ='EVENT',\n", " defaultValue = '0', \n", " description = enrichment_features[feature],\n", " variableType = enrichment_type[feature] )\n", " else:\n", " resp = afd_resource.create_variable(\n", " name = varname,\n", " dataType = 'STRING',\n", " dataSource ='EVENT',\n", " defaultValue = '', \n", " description = enrichment_features[feature],\n", " variableType = enrichment_type[feature] )\n", " \n", " \n", " # -- check and update the numeric features \n", " for feature in 
numeric_features: \n", " variable_list.append( {'name' : numeric_features[feature]+\"\"})\n", " try:\n", " varname = numeric_features[feature]+\"\"\n", " afd_resource.get_variables(name=varname)\n", " except:\n", " print(\"Creating variable: {0}\".format(numeric_features[feature]))\n", " resp = afd_resource.create_variable(\n", " name = varname,\n", " dataType = 'FLOAT',\n", " dataSource ='EVENT',\n", " defaultValue = '0.0', \n", " description = numeric_features[feature],\n", " variableType = 'NUMERIC' )\n", " \n", " # -- check and update the categorical features \n", " for feature in categorical_features: \n", " variable_list.append( {'name' : categorical_features[feature]+\"\"})\n", " try:\n", " varname = categorical_features[feature]+\"\"\n", " afd_resource.get_variables(name=varname)\n", " except:\n", " print(\"Creating variable: {0}\".format(categorical_features[feature]))\n", " resp = afd_resource.create_variable(\n", " name = varname,\n", " dataType = 'STRING',\n", " dataSource ='EVENT',\n", " defaultValue = '', \n", " description = categorical_features[feature],\n", " variableType = 'CATEGORICAL' )\n", " \n", " # -- create a model score feature \n", " model_feature = \"{0}_insightscore\".format(MODEL_NAME) \n", " # variable_list.append( {'name' : model_feature})\n", " try:\n", " afd_resource.get_variables(name=model_feature)\n", " except:\n", " print(\"Creating variable: {0}\".format(model_feature))\n", " resp = afd_resource.create_variable(\n", " name = model_feature,\n", " dataType = 'FLOAT',\n", " dataSource ='MODEL_SCORE',\n", " defaultValue = '0.0', \n", " description = model_feature,\n", " variableType = 'NUMERIC' )\n", " \n", " return variable_list\n", "\n", "\n", "model_variables = create_variables(df_stats, MODEL_NAME)\n", "print(\"\\n --- model variable dict --\")\n", "print(model_variables)\n", "\n", "\n", "model_label = create_label(data, \"EVENT_LABEL\")\n", "print(\"\\n --- model label schema dict --\")\n", "print(model_label)" ] }, { 
# %% Cast the event labels
# Amazon Fraud Detector expects label names as strings, so every entry of
# `eventLabels` (built in an earlier cell) is converted with str().
eventLabels = [str(label) for label in eventLabels]
print(eventLabels)

# %% Register the entity type and the event type
response = afd_resource.put_entity_type(name=ENTITY_TYPE, description=ENTITY_DESC)
print("-- create entity --")
print(response)

# The event type ties together the variables, the labels and the entity type.
response = afd_resource.put_event_type(
    name=EVENT_TYPE,
    eventVariables=eventVariables,
    labels=eventLabels,
    entityTypes=[ENTITY_TYPE],
)
print("-- create event type --")
print(response)

# %% Create the (still untrained) Online Fraud Insights model
response = afd_resource.create_model(
    modelId=MODEL_NAME,
    modelType='ONLINE_FRAUD_INSIGHTS',
    eventTypeName=EVENT_TYPE,
    description=MODEL_DESC,
)
print("-- initalize model --")
print(response)

# %% Start training a model version from the training CSV stored in S3
S3_FILE = "dataset-training.csv"
S3_FILE_LOC = f"s3://{S3_BUCKET}/{S3_FILE}"

# ARN_ROLE is the role passed to the service to read the training file.
response = afd_resource.create_model_version(
    modelId=MODEL_NAME,
    modelType='ONLINE_FRAUD_INSIGHTS',
    trainingDataSource='EXTERNAL_EVENTS',
    trainingDataSchema=trainingDataSchema,
    externalEventsDetail={'dataLocation': S3_FILE_LOC, 'dataAccessRoleArn': ARN_ROLE},
)
print("-- model training --")
print(response)
# %% Poll Amazon Fraud Detector until version 1.0 of the model leaves the
# TRAINING_IN_PROGRESS state (checked once per minute).
print("-- wait for model training to complete --")
stime = time.time()
while True:
    clear_output(wait=True)  # keep a single, refreshed status line in the cell output
    response = afd_resource.get_model_version(
        modelId=MODEL_NAME,
        modelType="ONLINE_FRAUD_INSIGHTS",
        modelVersionNumber='1.0',
    )
    if response['status'] != 'TRAINING_IN_PROGRESS':
        # Any other status ends the wait; it is printed as reported by the service.
        print("Model status : " + response['status'])
        break
    print(f"current progress: {(time.time() - stime)/60:{3}.{3}} minutes")
    time.sleep(60)  # -- wait one minute before asking again

etime = time.time()

# -- summarize --
print("\n --- model training complete --")
print("Elapsed time : {0} seconds \n".format(etime - stime))
print(response)
# %% Activate model version 1.0 and wait until it can be used by a detector
response = afd_resource.update_model_version_status(
    modelId=MODEL_NAME,
    modelType='ONLINE_FRAUD_INSIGHTS',
    modelVersionNumber='1.0',
    status='ACTIVE'
)
print("-- activating model --")
print(response)

# -- wait until the model is active (polled once per minute)
print("--- waiting until model status is active ")
stime = time.time()
while True:
    clear_output(wait=True)  # keep a single, refreshed status line in the cell output
    response = afd_resource.get_model_version(
        modelId=MODEL_NAME,
        modelType="ONLINE_FRAUD_INSIGHTS",
        modelVersionNumber='1.0',
    )
    if response['status'] == 'ACTIVE':
        print("Model status : " + response['status'])
        break
    if response['status'] == 'ERROR':
        # ERROR is a documented model-version status. Without this check the
        # loop would keep polling forever because ACTIVE is never reached.
        raise RuntimeError(
            "Model version 1.0 of {0} entered status ERROR while activating".format(MODEL_NAME)
        )
    print(f"current progress: {(time.time() - stime)/60:{3}.{3}} minutes")
    time.sleep(60)  # sleep for 1 minute

etime = time.time()
print("Elapsed time : %s" % (etime - stime) + " seconds \n")
print(response)
# %% Model performance summary
# Fetch the training metrics once and reuse them for both the AUC value and
# the ROC data points (previously the same API call was issued twice).
training_metrics = afd_resource.describe_model_versions(
    modelId=MODEL_NAME,
    modelVersionNumber='1.0',
    modelType='ONLINE_FRAUD_INSIGHTS',
    maxResults=10
)['modelVersionDetails'][0]['trainingResult']['trainingMetrics']

# NOTE(review): `auc` is also imported from sklearn.metrics in the notebook's
# import cell; this assignment rebinds it to a float. The name is kept because
# other cells may rely on it - consider renaming it in a follow-up.
auc = training_metrics['auc']

# One row per metric data point; the columns used below are fpr, tpr and threshold.
df_model = pd.DataFrame(training_metrics['metricDataPoints'])

plt.figure(figsize=(6,6))
plt.plot(df_model["fpr"], df_model["tpr"], color='darkorange',
         lw=2, label='ROC curve (area = %0.3f)' % auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title( MODEL_NAME + ' ROC Chart')
plt.legend(loc="lower right",fontsize=12)
# NOTE(review): the red guide lines are hard-coded at fpr=0.02 / tpr=0.73;
# presumably a hand-picked operating point - confirm it still matches the model.
plt.axvline(x = 0.02 ,linewidth=2, color='r')
plt.axhline(y = 0.73 ,linewidth=2, color='r')
plt.show()

# %% Create (or update) the detector that will host the rules and the model
response = afd_resource.put_detector(detectorId = DETECTOR_NAME,
                                     description = DETECTOR_DESC,
                                     eventTypeName = EVENT_TYPE )

print(response)
# %% Derive detector rules from the model's ROC data points
model_stat = df_model.round(decimals=2)

# For every (rounded) false-positive rate keep the row with the highest threshold.
# The explicit copy makes `m` an independent frame, so the columns added below
# never write into a selection of `model_stat`.
m = model_stat.loc[model_stat.groupby(["fpr"])["threshold"].idxmax()].copy()

def make_rule(x):
    """Build the rule expression for one row of the threshold table.

    Args:
        x: a row with the keys 'fpr', 'threshold' and 'threshold_prev'.

    Returns:
        "$<MODEL_NAME>_insightscore > <threshold>" when fpr <= 0.05,
        "$<MODEL_NAME>_insightscore <= <threshold_prev>" when fpr == 0.06,
        and an empty string for every other row.
    """
    rule = ""
    if x['fpr'] <= 0.05:
        rule = "${0}_insightscore > {1}".format(MODEL_NAME,x['threshold'])
    if x['fpr'] == 0.06:
        # catch-all rule: everything at or below the previous row's threshold
        rule = "${0}_insightscore <= {1}".format(MODEL_NAME,x['threshold_prev'])
    return rule

# threshold of the preceding row (the next lower fpr bucket)
m["threshold_prev"] = m['threshold'].shift(1)
m['rule'] = m.apply(make_rule, axis=1)

# outcome per fpr band: <= 3% fraud, 3-5% investigate, everything else approve
m['outcome'] = "approve"
m.loc[m['fpr'] <= 0.03, "outcome"] = "fraud"
m.loc[(m['fpr'] > 0.03) & (m['fpr'] <= 0.05), "outcome"] = "investigate"

print (" --- score thresholds 1% to 6% --- ")
print(m[["fpr", "tpr", "threshold", "rule", "outcome"]].loc[(m['fpr'] > 0.0 ) & (m['fpr'] <= 0.06)].reset_index(drop=True))

# %% Create the outcomes referenced by the rules
def create_outcomes(outcomes):
    """Register each outcome name with Amazon Fraud Detector.

    Args:
        outcomes: iterable of outcome names; each name is also used as the
            outcome description.
    """
    for outcome in outcomes:
        print("creating outcome variable: {0} ".format(outcome))
        afd_resource.put_outcome(
            name=outcome,
            description=outcome)

# -- get distinct outcomes
outcomes = m["outcome"].unique().tolist()

create_outcomes(outcomes)
# %% Create the rules with the desired outcomes and attach them to our detector
# Only the rows between 1% and 6% false-positive rate carry a rule expression.
rule_set = m[(m["fpr"] > 0.0) & (m["fpr"] <= 0.06)][["outcome", "rule"]].to_dict('records')
rule_list = []
for i, rule in enumerate(rule_set):
    ruleId = "rule{0}_{1}".format(i, MODEL_NAME)
    # The detector version below references every rule by id and version,
    # whether or not the rule had to be created in this run.
    rule_list.append({"ruleId": ruleId,
                      "ruleVersion" : '1',
                      "detectorId"  : DETECTOR_NAME
                     })
    print("creating rule: {0}: IF {1} THEN {2}".format(ruleId, rule["rule"], rule['outcome']))
    try:
        response = afd_resource.create_rule(
            ruleId = ruleId,
            detectorId = DETECTOR_NAME,
            expression = rule['rule'],
            language = 'DETECTORPL',
            outcomes = [rule['outcome']]
        )
    except afd_resource.exceptions.ClientError as err:
        # Best effort: keep going so the notebook can be re-run, but show the
        # real service error instead of assuming the rule already exists.
        print("could not create rule {0} (it may already exist in this detector): {1}".format(ruleId, err))
rule_list

# %% Create the detector version that combines the rules with the trained model
# The result is assigned so that the response printed below belongs to this
# call and not to the last create_rule call.
response = afd_resource.create_detector_version(
    detectorId = DETECTOR_NAME,
    rules = rule_list,
    modelVersions = [{"modelId":MODEL_NAME,
                      "modelType" : "ONLINE_FRAUD_INSIGHTS",
                      "modelVersionNumber" : "1.0"}],
    ruleExecutionMode = 'FIRST_MATCHED'
    )

print("\n -- detector created -- ")
print(response)
# %% Activate version 1 of the detector so it can serve predictions
response = afd_resource.update_detector_version_status(
    detectorId=DETECTOR_NAME, detectorVersionId='1', status='ACTIVE')
print("\n -- detector activated -- ")
print(response)

# %% Download the held-out test split from the training bucket
# Note: S3_FILE / S3_FILE_LOC are rebound here from the training file to the test file.
S3_FILE = "dataset-test.csv"
S3_FILE_LOC = f"s3://{S3_BUCKET}/{S3_FILE}"

# The object is saved next to the notebook under a fixed local name.
s3_resource.Bucket(S3_BUCKET).download_file(S3_FILE, 'dataset-test.csv')
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamount
041505.0-16.5265078.584972-18.6498539.505594-13.793819-2.832404-16.7016947.517344-8.507059-14.1101845.299236-10.8340061.671120-9.3738590.360806-9.899247-19.236292-8.3985523.101735-1.5149231.190739-1.127670-2.3585790.673461-1.413700-0.462762-2.018575-1.042804364.19
144261.00.339812-2.743745-0.134070-1.385729-1.4514131.015887-0.5243790.2240600.899746-0.565012-0.0876700.9794270.076883-0.217884-0.136830-2.1428920.1269561.7526620.4325460.506044-0.213436-0.942525-0.526819-1.1569920.311211-0.7466470.0409960.102038520.12
235484.01.399590-0.5907010.168619-1.029950-0.5398060.040444-0.7125670.002299-0.9717470.7568010.5438270.1124531.075384-0.2457720.1804831.769860-0.533172-0.5333001.1922450.2128770.1023980.168269-0.166639-0.8102500.505083-0.2323400.0114090.00463431.00
3167123.0-0.4320711.647895-1.669361-0.3495040.785785-0.6306470.2769900.586025-0.484715-1.376648-1.3283350.2236211.132627-0.5508750.6165680.4979740.5021950.9813430.101264-0.2446330.3589320.873663-0.178642-0.017171-0.207392-0.157756-0.2373860.0019341.50
4168473.02.014160-0.137394-1.0158390.327269-0.182179-0.9565710.043241-0.1607460.3632410.2594520.9421620.850038-0.6161660.592634-0.6038450.091077-0.471867-0.3338160.404711-0.255293-0.238644-0.6164000.3470450.061561-0.3601960.174730-0.078043-0.0705710.89
567878.0-0.641330-0.0573041.489998-1.688131-1.1510430.259996-1.391069-2.3340751.168644-2.0840800.4803810.473738-2.1922760.7739420.2944840.406074-0.5418551.0314500.0170760.618411-1.2316340.257164-0.371953-0.0385661.397514-0.6659470.0310030.180357100.00
6159763.02.023952-0.120140-1.0869180.423019-0.142901-1.1277520.178493-0.3032340.5645090.062831-0.7200470.366835-0.1108570.3190940.108359-0.153633-0.221312-0.9341410.070553-0.210864-0.276175-0.6977080.335631-0.017196-0.3249040.200023-0.071566-0.05822416.99
7139631.0-0.6889441.292153-0.564281-1.4575262.258333-0.3232701.678984-0.104128-1.285351-1.3034350.282728-0.402525-0.548687-0.504283-0.6853390.714828-0.0926740.798953-0.150085-0.037150-0.006880-0.171568-0.720019-0.4194351.2119910.670916-0.1039860.0300848.95
8133944.02.1193620.142639-2.3733370.5419490.608419-1.7755640.955775-0.5993830.0104200.295305-0.936569-0.452478-1.3407981.077459-0.099584-0.8150720.018481-0.639446-0.065427-0.3235730.2642640.898266-0.1680630.0593110.6269490.729035-0.129120-0.09471310.00
958769.0-5.584256-4.732413-0.448452-0.121442-0.707412-0.114376-1.5546281.402126-0.031693-0.942358-2.439501-0.552312-0.295588-0.250246-1.1977321.5495530.933237-1.2376890.416832-1.0469000.0416510.6217890.223467-0.7701370.621182-0.0287380.505194-1.898323101.49
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount\n", "0 41505.0 -16.526507 8.584972 -18.649853 9.505594 -13.793819 -2.832404 -16.701694 7.517344 -8.507059 -14.110184 5.299236 -10.834006 1.671120 -9.373859 0.360806 -9.899247 -19.236292 -8.398552 3.101735 -1.514923 1.190739 -1.127670 -2.358579 0.673461 -1.413700 -0.462762 -2.018575 -1.042804 364.19\n", "1 44261.0 0.339812 -2.743745 -0.134070 -1.385729 -1.451413 1.015887 -0.524379 0.224060 0.899746 -0.565012 -0.087670 0.979427 0.076883 -0.217884 -0.136830 -2.142892 0.126956 1.752662 0.432546 0.506044 -0.213436 -0.942525 -0.526819 -1.156992 0.311211 -0.746647 0.040996 0.102038 520.12\n", "2 35484.0 1.399590 -0.590701 0.168619 -1.029950 -0.539806 0.040444 -0.712567 0.002299 -0.971747 0.756801 0.543827 0.112453 1.075384 -0.245772 0.180483 1.769860 -0.533172 -0.533300 1.192245 0.212877 0.102398 0.168269 -0.166639 -0.810250 0.505083 -0.232340 0.011409 0.004634 31.00\n", "3 167123.0 -0.432071 1.647895 -1.669361 -0.349504 0.785785 -0.630647 0.276990 0.586025 -0.484715 -1.376648 -1.328335 0.223621 1.132627 -0.550875 0.616568 0.497974 0.502195 0.981343 0.101264 -0.244633 0.358932 0.873663 -0.178642 -0.017171 -0.207392 -0.157756 -0.237386 0.001934 1.50\n", "4 168473.0 2.014160 -0.137394 -1.015839 0.327269 -0.182179 -0.956571 0.043241 -0.160746 0.363241 0.259452 0.942162 0.850038 -0.616166 0.592634 -0.603845 0.091077 -0.471867 -0.333816 0.404711 -0.255293 -0.238644 -0.616400 0.347045 0.061561 -0.360196 0.174730 -0.078043 -0.070571 0.89\n", "5 67878.0 -0.641330 -0.057304 1.489998 -1.688131 -1.151043 0.259996 -1.391069 -2.334075 1.168644 -2.084080 0.480381 0.473738 -2.192276 0.773942 0.294484 0.406074 -0.541855 1.031450 0.017076 0.618411 -1.231634 0.257164 -0.371953 -0.038566 1.397514 -0.665947 0.031003 0.180357 100.00\n", "6 159763.0 2.023952 -0.120140 -1.086918 0.423019 -0.142901 -1.127752 0.178493 -0.303234 0.564509 0.062831 -0.720047 0.366835 -0.110857 
# %% Load the downloaded test split and preview it
# Widen the pandas display limits so all columns fit in the preview.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

test = pd.read_csv('dataset-test.csv', sep=',')
test.head(10)
# %% Scoring inputs
record_count = 500
# Every column except the training-only label/timestamp columns is sent as an event variable.
model_variables = [column for column in test.columns if column not in ['EVENT_LABEL', 'EVENT_TIMESTAMP']]
# The format string ends in a literal "Z" (UTC designator), so the value must be
# taken from the UTC clock rather than from the machine's local time.
dateTimeObj = datetime.utcnow()
timestampStr = dateTimeObj.strftime("%Y-%m-%dT%H:%M:%SZ")
print(' '.join(model_variables))

# %% Test the endpoint with a single prediction
# (uuid is already imported in the notebook's import cell.)
eventId = uuid.uuid1()
# Row 6 of the test set, with every value converted to a string per column.
testrecord = test[model_variables].iloc[[6]].astype(str).to_dict(orient='records')[0]
pred = afd_resource.get_event_prediction(detectorId=DETECTOR_NAME,
                                         detectorVersionId='1',
                                         eventId = str(eventId),
                                         eventTypeName = EVENT_TYPE,
                                         eventTimestamp = timestampStr,
                                         entities = [{'entityType': ENTITY_TYPE, 'entityId':str(eventId.int)}],
                                         eventVariables= testrecord)
print(pred)
# %% Quick look at the test set and its labels
test.count()
y_test

# %% Helper used by the parallel (dask) scoring run
import dask
import time
from IPython.core.display import display, HTML
#display(HTML(""))

start = time.time()

@dask.delayed
def _predict(record):
    """Score one event with the detector and annotate the record in place.

    Because of the ``dask.delayed`` decorator, calling this function only
    schedules the work; the API call happens when the result is computed.

    Args:
        record: dict of event variable name -> string value.

    Returns:
        The same dict with two extra keys:
        ``score``    - the model insight score, or the string "-999" on failure;
        ``outcomes`` - the outcomes of the first matched rule, 'approve' when
                       no rule matched, or "error" on failure.
    """
    eventId = uuid.uuid1()
    try:
        pred = afd_resource.get_event_prediction(detectorId=DETECTOR_NAME,
                                                 detectorVersionId='1',
                                                 eventId = str(eventId),
                                                 eventTypeName = EVENT_TYPE,
                                                 eventTimestamp = timestampStr,
                                                 entities = [{'entityType': ENTITY_TYPE, 'entityId':str(eventId.int)}],
                                                 eventVariables= record)

        record["score"] = pred['modelScores'][0]['scores']["{0}_insightscore".format(MODEL_NAME)]
        if len(pred['ruleResults']) > 0:
            record["outcomes"]= pred['ruleResults'][0]['outcomes']
        else:
            record["outcomes"]= 'approve'
        return record

    except Exception:
        # Mark the record as failed instead of repeating the same request:
        # a second failure used to escape the handler, and a successful retry
        # was thrown away anyway.
        record["score"] = "-999"
        record["outcomes"]= "error"
        return record

# %% dask approach; to score the entire test dataset (28481 rows) prefer the
# loop approach in the next cell
predict_data = test[model_variables].head(400).astype(str).to_dict(orient='records')
predict_score = []
start = time.time()  # time the scoring run itself, not the gap since the cell above

for i, record in enumerate(predict_data, start=1):
    clear_output(wait=True)
    # _predict is already a dask.delayed function, so calling it queues the task;
    # this loop only builds the task list, the scoring happens in dask.compute.
    predict_score.append(_predict(record))
    print("current progress: ", round((i/len(predict_data))*100,2), "%" )

predict_recs = dask.compute(*predict_score)

# Calculate time taken and print results
time_taken = time.time() - start
tps = len(predict_recs) / time_taken

print ('Process took %0.2f seconds' %time_taken)
print ('Scored %d records' %len(predict_recs))

# %% loop approach, slower but doesn't freeze the browser, go grab a coffee :)
# 28481 is the number of rows reported for the test split.
predict_data = test[model_variables].head(28481).astype(str).to_dict(orient='records')
predict_score = []
start = time.time()

for record in predict_data:
    eventId = uuid.uuid1()
    pred = afd_resource.get_event_prediction(detectorId=DETECTOR_NAME,
                                             detectorVersionId='1',
                                             eventId = str(eventId),
                                             eventTypeName = EVENT_TYPE,
                                             eventTimestamp = timestampStr,
                                             entities = [{'entityType': ENTITY_TYPE, 'entityId':str(eventId.int)}],
                                             eventVariables= record)
    record["score"] = pred['modelScores'][0]['scores']["{0}_insightscore".format(MODEL_NAME)]

# Calculate time taken and print results
time_taken = time.time() - start
# Use this cell's own data: predict_recs only exists if the optional dask cell was run.
tps = len(predict_data) / time_taken

print ('Process took %0.2f seconds' %time_taken)
print ('Scored %d records' %len(predict_data))

# %% Collect the scores in test-set order
p_data = [d['score'] for d in predict_data]

# %% Fraud detector model balanced accuracy
# A score above the threshold is classified as fraud (1), anything else as
# non-fraud (0). The scores seen in this notebook go up to 1000.0, so 950 is a
# cut-off on the score scale, not a probability.
AFD_SCORE_THRESHOLD = 950
afd_preds = np.where(np.array(p_data) > AFD_SCORE_THRESHOLD, 1, 0)
print("Balanced accuracy = {}".format(balanced_accuracy_score(y_test, afd_preds)))
# %% Confusion matrix of the thresholded fraud detector predictions
plot_confusion_matrix(y_test, afd_preds)

# %% Per-class precision / recall / f1 for the same predictions
# Label 0 is reported as 'non-fraud' and label 1 as 'fraud'.
class_names = ['non-fraud', 'fraud']
afd_report = classification_report(y_test, afd_preds, target_names=class_names)
print(afd_report)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timevavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvyvzvaavabamountscoreoutcomes
041505.0-16.52650656912318.58497179585822-18.6498531851944989.50559351508723-13.793818527095699-2.8324042993974703-16.7016942960457.517343903709871-8.50705863675898-14.1101844415456965.29923634963938-10.83400648147341.67112025332681-9.373858583649760.36080564163161705-9.899246540806661-19.2362923697613-8.398551994945753.10173536885404-1.514923435278521.19073869481428-1.1276700090206102-2.35857876978810.6734613289872371-1.4136996745881998-0.46276236139933-2.01857524875161-1.04280416970881364.191000.0[fraud]
1652913.00.6768548293865951.58702920998622-1.508146982548071.443814844350841.31678997910859-1.16034214105187021.1490491098159-0.4193931328245341-0.0905654487115674-0.3230584848927612.967920897973221-0.155026280352087-0.7177661091303009-3.709275179915210.7310503071803590.510125020058252.73026213011681041.46374586218951-0.5784055858811940.31792704392540505-0.1914184018868230.1014507814923720.05829839657805901-0.135311156556857-0.6044061673895129-0.45276321577020710.0464301769729269-0.38364151787050610.89977.0[fraud]
1955575.0-3.41942263819263-1.130075598090151.607086373753772.22997892209850962.64408903133496-0.79353772235042011.08824220975816-1.00840988415033-0.9333151682582161.921981759265391.95444539942508970.3888397669221480.30147367767299-0.427868493466856-0.0292266721907548-0.0750638320495107-0.7991270792650841-0.629796480633153-0.469016593690311-1.44533223472583-0.4796742054290580.5240222644533290.0936850261636710.305230152643873930.65396264039042710.0228521071220258-0.955552398442486-0.085629544571638322.7953.0[fraud]
32141655.0-29.942972270707894-25.8317821051362-16.22751192912736.690679169572508-20.787845811536313.085694370938917.2566228959585-9.161746211138565.00304130464362-2.43146639127421.75998939335631021.861432717528513.05338368623288-1.40963974297515023.680271528524714.531275030273720.22001123266623696-1.604536175899423.14031061569242-21.3871219290332-2.49469918146669-0.660297208514391-8.537815806510410.40080447201294-0.6430227870311340.49690284583126896.26770908866261-2.76507000175875953502.11962.0[fraud]
5156803.01.173609124328550.511603445630379-0.52045147570860111.122626206084360.35886736932159297-0.295960433531853030.02500463877529330.14444171501870098-0.110183833896483-0.5133584384151081.50090553458987-0.228726450784267-1.5703348271985802-0.8396775630709610.82551133038619420.66477432410181890.8862748303610830.925736187467082-0.5606318507221589-0.20184199976003606-0.0268349064777576-0.0755364811334122-0.129545704955994-0.48168672971307110.582500674589865-0.2898759014019720.03791282390347460.03743657518193761.0958.0[fraud]
5994330.0-0.1591882534515391.63642225435632051.545990380254814.554799403928640.9725001613143450.1356194690038960.890709635187871-0.308928981901544-0.7506710020331751.488085162255360.0189227146632523-3.514951120162031.378520839157811.75093472390239980.0364873683299348-0.3325434537365550.83124079874662720.02724267393996681.117555277824450.374095700292396-0.4149084993816161-0.72533256362199290.0198138142745913-0.0804172850283939-0.56664384906918710.02949884484729110.2068604920836630.0030600077367060412.74965.0[fraud]
17758954.01.238042617998440.77053043333537-0.3319975239085191.345464512194510.139816295211697-1.229675965953360.354259165638368-0.280804576159513-0.184996149214632-0.6998128892677070.2306550666583520.214775247241316050.561439138721415-1.45979426106370981.02124766236501020.5671136559722870.8590524028769070.516214961212954-0.629783953300129-0.0582422755483618-0.0605703154557262-0.0710964980063678-0.1310000239867180.2691442471066710.7120999799201929-0.329489178612648040.0393080261364550.05883787437146981.0947.0[fraud]
253878.0-2.51219240716515020.317838238718562041.270332802121071.214257950189791.17990250727029-0.412233265109121061.00271608138525-0.77468018394012411.656601996466922.7790737156669403-0.0847781328647432-0.18321933511528-0.330652014184725-1.27735053626582020.586282642874345-1.569661847674820.122172327014581-0.91672986647514520.66952463425091890.5929484938755051-0.6552577988297880.24236823281838-0.1350739000735970.1042607010447150.211102067974728-0.287306672363311030.3351291310584530.3282118191077618.8962.0[fraud]
328110199.01.751363243257720.3378124008339520.7952623929564184.31273176972153-0.681348924648448-0.126233056366034-0.566591255980354-0.03134405796601180.9195228807082540.9751300812665230.435764202447928-2.215695859205621.914571179079941.15256247746996-1.37465707640882020.82398092409294710.175328666447042-0.0483307755538258-2.38942182862032-0.2899434441092140.160070701106165950.8038411155151620.253569280923484960.8074573633362009-0.3400690874520441-0.00655722686278409-0.00813323923492112-0.025071551940193729.58955.0[fraud]
352164441.01.887761424030020.62826988690252-2.759570582863541.622395827442550.9446222968645832-1.154559012666870.364580675536746-0.135389346167279020.0345521682776357-0.97469507733035521.53126492421627-0.558651257008026-1.40204078796233-2.72556210859660.4057779092180571.431159483215741.956127100137552.26414005330485-0.8243269352259971-0.133303038078448980.1068235738572810.29362617939436403-0.136295490129905-0.73096267217823210.263143555477082-0.41954559365040990.02923912049066180.019826445101855148.8976.0[fraud]
36168878.0-4.80458628691832-4.350219221436540.85473394419768510.2078105803862342.91117776563597-2.16185200836836-2.10165901071884-1.60972469419277-1.245613056767151.136409004113531.56282964824949-0.656243196361657-1.353672680218550.4997651307796050.81534951867079510.77316997390809210.809416192491745-1.49110563925249022.03964533568703-2.16862341900588970.759812961941168-0.65509493945911791.646318675080590.04879335932347210.8261049032712451-0.09576068169313651.3059493867690302-0.57937853608619760.17960.0[fraud]
36623407.01.240746736507220.848160288821537-0.1998249517890071.553723837558020.171272503594932-1.194116483501410.228779262997882-0.3339504046228011.17298661915574-1.056222337416731.09596426457221-2.548640740955231.718976603118150.07083826639059470.336702692339511940.560484743900291.61892627448261050.8487676121649308-0.8297510609350229-0.13240918359282802-0.216617664690264-0.2968326218353089-0.1134421958930960.1949130118969550.67266243405551-0.3664718399033450.00942251427038020.05138885380081561.0902.0[fraud]
46753370.0-2.52338699856657-5.90958900863933-2.11454907610384042.68771715054533-1.71894595466850020.78834723929927012.62144747179771-0.387473414390844-0.304782617317214-0.75019119617736710.89256625998421490.679212965658016-0.5466734522705841.03001531809147-0.116777526168877-0.09147870610096-0.105836250043010.193687825429198-0.6820109170168113.676626072116961.17496005535309-0.8406424904120059-1.85015434331846-0.213489917842780.0526663052725845-0.42989130448891705-0.3449938939361070.3287425446865911918.5907.0[fraud]
\n", "
" ], "text/plain": [ " time va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz vaa vab amount score outcomes\n", "0 41505.0 -16.5265065691231 8.58497179585822 -18.649853185194498 9.50559351508723 -13.793818527095699 -2.8324042993974703 -16.701694296045 7.517343903709871 -8.50705863675898 -14.110184441545696 5.29923634963938 -10.8340064814734 1.67112025332681 -9.37385858364976 0.36080564163161705 -9.899246540806661 -19.2362923697613 -8.39855199494575 3.10173536885404 -1.51492343527852 1.19073869481428 -1.1276700090206102 -2.3585787697881 0.6734613289872371 -1.4136996745881998 -0.46276236139933 -2.01857524875161 -1.04280416970881 364.19 1000.0 [fraud]\n", "16 52913.0 0.676854829386595 1.58702920998622 -1.50814698254807 1.44381484435084 1.31678997910859 -1.1603421410518702 1.1490491098159 -0.4193931328245341 -0.0905654487115674 -0.323058484892761 2.967920897973221 -0.155026280352087 -0.7177661091303009 -3.70927517991521 0.731050307180359 0.51012502005825 2.7302621301168104 1.46374586218951 -0.578405585881194 0.31792704392540505 -0.191418401886823 0.101450781492372 0.05829839657805901 -0.135311156556857 -0.6044061673895129 -0.4527632157702071 0.0464301769729269 -0.3836415178705061 0.89 977.0 [fraud]\n", "19 55575.0 -3.41942263819263 -1.13007559809015 1.60708637375377 2.2299789220985096 2.64408903133496 -0.7935377223504201 1.08824220975816 -1.00840988415033 -0.933315168258216 1.92198175926539 1.9544453994250897 0.388839766922148 0.30147367767299 -0.427868493466856 -0.0292266721907548 -0.0750638320495107 -0.7991270792650841 -0.629796480633153 -0.469016593690311 -1.44533223472583 -0.479674205429058 0.524022264453329 0.093685026163671 0.30523015264387393 0.6539626403904271 0.0228521071220258 -0.955552398442486 -0.0856295445716383 22.7 953.0 [fraud]\n", "32 141655.0 -29.942972270707894 -25.8317821051362 -16.2275119291273 6.690679169572508 -20.7878458115363 13.0856943709389 17.2566228959585 -9.16174621113856 5.00304130464362 -2.4314663912742 
1.7599893933563102 1.86143271752851 3.05338368623288 -1.4096397429751502 3.68027152852471 4.53127503027372 0.22001123266623696 -1.60453617589942 3.14031061569242 -21.3871219290332 -2.49469918146669 -0.660297208514391 -8.53781580651041 0.40080447201294 -0.643022787031134 0.4969028458312689 6.26770908866261 -2.7650700017587595 3502.11 962.0 [fraud]\n", "51 56803.0 1.17360912432855 0.511603445630379 -0.5204514757086011 1.12262620608436 0.35886736932159297 -0.29596043353185303 0.0250046387752933 0.14444171501870098 -0.110183833896483 -0.513358438415108 1.50090553458987 -0.228726450784267 -1.5703348271985802 -0.839677563070961 0.8255113303861942 0.6647743241018189 0.886274830361083 0.925736187467082 -0.5606318507221589 -0.20184199976003606 -0.0268349064777576 -0.0755364811334122 -0.129545704955994 -0.4816867297130711 0.582500674589865 -0.289875901401972 0.0379128239034746 0.0374365751819376 1.0 958.0 [fraud]\n", "59 94330.0 -0.159188253451539 1.6364222543563205 1.54599038025481 4.55479940392864 0.972500161314345 0.135619469003896 0.890709635187871 -0.308928981901544 -0.750671002033175 1.48808516225536 0.0189227146632523 -3.51495112016203 1.37852083915781 1.7509347239023998 0.0364873683299348 -0.332543453736555 0.8312407987466272 0.0272426739399668 1.11755527782445 0.374095700292396 -0.4149084993816161 -0.7253325636219929 0.0198138142745913 -0.0804172850283939 -0.5666438490691871 0.0294988448472911 0.206860492083663 0.00306000773670604 12.74 965.0 [fraud]\n", "177 58954.0 1.23804261799844 0.77053043333537 -0.331997523908519 1.34546451219451 0.139816295211697 -1.22967596595336 0.354259165638368 -0.280804576159513 -0.184996149214632 -0.699812889267707 0.230655066658352 0.21477524724131605 0.561439138721415 -1.4597942610637098 1.0212476623650102 0.567113655972287 0.859052402876907 0.516214961212954 -0.629783953300129 -0.0582422755483618 -0.0605703154557262 -0.0710964980063678 -0.131000023986718 0.269144247106671 0.7120999799201929 -0.32948917861264804 0.039308026136455 
0.0588378743714698 1.0 947.0 [fraud]\n", "253 878.0 -2.5121924071651502 0.31783823871856204 1.27033280212107 1.21425795018979 1.17990250727029 -0.41223326510912106 1.00271608138525 -0.7746801839401241 1.65660199646692 2.7790737156669403 -0.0847781328647432 -0.18321933511528 -0.330652014184725 -1.2773505362658202 0.586282642874345 -1.56966184767482 0.122172327014581 -0.9167298664751452 0.6695246342509189 0.5929484938755051 -0.655257798829788 0.24236823281838 -0.135073900073597 0.104260701044715 0.211102067974728 -0.28730667236331103 0.335129131058453 0.32821181910776 18.8 962.0 [fraud]\n", "328 110199.0 1.75136324325772 0.337812400833952 0.795262392956418 4.31273176972153 -0.681348924648448 -0.126233056366034 -0.566591255980354 -0.0313440579660118 0.919522880708254 0.975130081266523 0.435764202447928 -2.21569585920562 1.91457117907994 1.15256247746996 -1.3746570764088202 0.8239809240929471 0.175328666447042 -0.0483307755538258 -2.38942182862032 -0.289943444109214 0.16007070110616595 0.803841115515162 0.25356928092348496 0.8074573633362009 -0.3400690874520441 -0.00655722686278409 -0.00813323923492112 -0.0250715519401937 29.58 955.0 [fraud]\n", "352 164441.0 1.88776142403002 0.62826988690252 -2.75957058286354 1.62239582744255 0.9446222968645832 -1.15455901266687 0.364580675536746 -0.13538934616727902 0.0345521682776357 -0.9746950773303552 1.53126492421627 -0.558651257008026 -1.40204078796233 -2.7255621085966 0.405777909218057 1.43115948321574 1.95612710013755 2.26414005330485 -0.8243269352259971 -0.13330303807844898 0.106823573857281 0.29362617939436403 -0.136295490129905 -0.7309626721782321 0.263143555477082 -0.4195455936504099 0.0292391204906618 0.0198264451018551 48.8 976.0 [fraud]\n", "361 68878.0 -4.80458628691832 -4.35021922143654 0.8547339441976851 0.207810580386234 2.91117776563597 -2.16185200836836 -2.10165901071884 -1.60972469419277 -1.24561305676715 1.13640900411353 1.56282964824949 -0.656243196361657 -1.35367268021855 0.499765130779605 0.8153495186707951 
0.7731699739080921 0.809416192491745 -1.4911056392524902 2.03964533568703 -2.1686234190058897 0.759812961941168 -0.6550949394591179 1.64631867508059 0.0487933593234721 0.8261049032712451 -0.0957606816931365 1.3059493867690302 -0.579378536086197 60.17 960.0 [fraud]\n", "366 23407.0 1.24074673650722 0.848160288821537 -0.199824951789007 1.55372383755802 0.171272503594932 -1.19411648350141 0.228779262997882 -0.333950404622801 1.17298661915574 -1.05622233741673 1.09596426457221 -2.54864074095523 1.71897660311815 0.0708382663905947 0.33670269233951194 0.56048474390029 1.6189262744826105 0.8487676121649308 -0.8297510609350229 -0.13240918359282802 -0.216617664690264 -0.2968326218353089 -0.113442195893096 0.194913011896955 0.67266243405551 -0.366471839903345 0.0094225142703802 0.0513888538008156 1.0 902.0 [fraud]\n", "467 53370.0 -2.52338699856657 -5.90958900863933 -2.1145490761038404 2.68771715054533 -1.7189459546685002 0.7883472392992701 2.62144747179771 -0.387473414390844 -0.304782617317214 -0.7501911961773671 0.8925662599842149 0.679212965658016 -0.546673452270584 1.03001531809147 -0.116777526168877 -0.09147870610096 -0.10583625004301 0.193687825429198 -0.682010917016811 3.67662607211696 1.17496005535309 -0.8406424904120059 -1.85015434331846 -0.21348991784278 0.0526663052725845 -0.42989130448891705 -0.344993893936107 0.328742544686591 1918.5 907.0 [fraud]" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lets take a look to the predicted frauds\n", "predictions = pd.DataFrame.from_dict(predict_recs, orient='columns')\n", "predictions.loc[predictions['score'].astype('float32') > 900]" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': '5B6ED35CCB4FB643',\n", " 'HostId': 'lfEj4Wi46OeQJbr7wiYy4Vh2xqAZdNFjHDXWIx449ZKcXRkZIb+RNiqI5eeyQ3fKi+Gv/0YZTtI=',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amz-id-2': 
'lfEj4Wi46OeQJbr7wiYy4Vh2xqAZdNFjHDXWIx449ZKcXRkZIb+RNiqI5eeyQ3fKi+Gv/0YZTtI=',\n", " 'x-amz-request-id': '5B6ED35CCB4FB643',\n", " 'date': 'Tue, 22 Sep 2020 22:57:06 GMT',\n", " 'x-amz-server-side-encryption': 'AES256',\n", " 'etag': '\"a349bb5bdc090d7f8ce04f7816ac60d5\"',\n", " 'content-length': '0',\n", " 'server': 'AmazonS3'},\n", " 'RetryAttempts': 0},\n", " 'ETag': '\"a349bb5bdc090d7f8ce04f7816ac60d5\"',\n", " 'ServerSideEncryption': 'AES256'}" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Save the predictions to a CSV file and upload it to the output S3 bucket.\n", "# The CSV is built in memory (StringIO), so nothing is written to local disk.\n", "csv_buffer = StringIO()\n", "predictions.to_csv(csv_buffer, index=False)\n", "\n", "# Upload to S3_OUT_BUCKET (the output bucket from the setup cell), not S3_BUCKET,\n", "# which is the training bucket. The key is MODEL_NAME + 'predictions<date suffix>.csv'.\n", "predictions_key = MODEL_NAME + \"predictions{}.csv\".format(sufx)\n", "s3_resource.Object(S3_OUT_BUCKET, predictions_key).put(Body=csv_buffer.getvalue())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finish" ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 2 }