# Player micro-transaction fraud detector with SageMaker Linear Learner (logistic-regression binary classifier)

## Investigate and process the data

In [15]:
import boto3
import botocore
import sagemaker
import sys


# S3 bucket used for training input and output -- specify a bucket you have access to.
bucket = 'percona2020-player-events'
execution_role = sagemaker.get_execution_role()

# Messages for the error codes this cell reports instead of raising.
_BUCKET_ERROR_MESSAGES = {
    '403': "Hey! You don't have permission to access the bucket, {}.",
    '404': "Hey! Your bucket, {}, doesn't exist!",
}

# Probe the bucket up front so access problems show up before any other work.
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as err:
    message = _BUCKET_ERROR_MESSAGES.get(err.response['Error']['Code'])
    if message is None:
        # Any other error code is unexpected: let it propagate unchanged.
        raise
    print(message.format(bucket))
else:
    print('Training input/output will be stored in: s3://{}/'.format(bucket))

Training input/output will be stored in: s3://percona2020-player-events/


In [16]:
%%bash
# List the top-level prefixes and objects of the project bucket.
aws s3 ls s3://percona2020-player-events/

                           PRE AWSLogs/
                           PRE curated-data/
                           PRE encounters/
                           PRE players_cheat_model/
                           PRE sagemaker/
                           PRE transactions_cheat_model/
2020-03-24 16:26:21    5014975 curated-data
2020-02-24 04:40:50    2496166 model.tar.gz
2020-02-24 04:55:37    5247027 player-dynamic-encounters.csv
2020-02-26 16:48:23 15192938184 player_encounters.csv
2020-04-23 05:32:36   55749535 players-transactions.csv.part_00000
2020-03-24 23:22:41    5307147 results.csv
2020-04-20 22:43:26   15196650 training_player_trans.csv


In [17]:
%%time

import pandas as pd
import urllib.request
import boto3

# Bucket, object key and local file name of the exported transactions CSV.
# execute in RDS to generate data_source
# select month,day,hour,minute,unix_timestamp,name,uagent,class from  transactions INTO OUTFILE S3 's3-us-west-2://percona2020-player-events/players-transactions.csv' FORMAT CSV HEADER  OVERWRITE ON;
data_source = 'percona2020-player-events'
data_objectname = 'players-transactions.csv.part_00000'
data_filename = 'players-transactions.csv.part_00000'

# Copy the object from S3 into the notebook's working directory.
s3 = boto3.client('s3')
s3.download_file(Bucket=data_source, Key=data_objectname, Filename=data_filename)

# Parse the downloaded CSV and show which columns were loaded.
player_data = pd.read_csv(data_filename, sep=',')
print(player_data.columns)

Index(['month', 'day', 'hour', 'minute', 'unix_timestamp', 'name', 'uagent',
       'class'],
      dtype='object')
CPU times: user 1.64 s, sys: 238 ms, total: 1.88 s
Wall time: 1.52 s


In [18]:
# Show the column names of the loaded frame.
print(player_data.columns)
# NOTE(review): the disabled call below selects 'timestamp' and 'playerGuId',
# which are not among the columns printed when the CSV was loaded.
#player_data[['timestamp','playerGuId','name', 'class']].describe()

Index(['month', 'day', 'hour', 'minute', 'unix_timestamp', 'name', 'uagent',
       'class'],
      dtype='object')


Let's take a peek at our data (only the first five rows are shown):

In [20]:
# NOTE(review): the two disabled drops below name 'playerGuId' and 'timestamp',
# which are not among the columns printed when the CSV was loaded.
#player_data=player_data.drop('playerGuId',axis=1)
#player_data=player_data.drop('timestamp',axis=1)
# Display the first rows of the frame (head() defaults to five).
player_data.head()

Unnamed: 0,month,day,hour,minute,unix_timestamp,name,uagent,class
0,4,7,5,44,1587793483,LootBoxesType2,8,0
1,5,6,19,39,1590781178,LootBoxesType2,6,0
2,4,2,22,58,1587423493,Wormhole,7,0
3,4,1,16,12,1587312731,Wormhole,8,0
4,5,5,19,31,1589484694,Wormhole,9,0


Encode the transaction type (the `name` column) as an integer label:

In [21]:
import csv
import sys
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Learn one integer code per distinct transaction name, then apply the mapping
# to the whole column and store the codes next to the original values.
# NOTE(review): integer codes put an arbitrary order on the transaction types;
# OneHotEncoder is imported in this cell but not used in the visible code --
# confirm that ordinal codes are intended for a linear model.
label_encoder = LabelEncoder()
label_encoder.fit(player_data["name"])
integer_encoded = label_encoder.transform(player_data["name"])
player_data["name_encoded"] = integer_encoded
player_data.head()



Unnamed: 0,month,day,hour,minute,unix_timestamp,name,uagent,class,name_encoded
0,4,7,5,44,1587793483,LootBoxesType2,8,0,4
1,5,6,19,39,1590781178,LootBoxesType2,6,0,4
2,4,2,22,58,1587423493,Wormhole,7,0,6
3,4,1,16,12,1587312731,Wormhole,8,0,6
4,5,5,19,31,1589484694,Wormhole,9,0,6


In [35]:
# Number of rows for each encoded transaction type.
trans_type=player_data.groupby('name_encoded').size()
print(trans_type)

name_encoded
0       453
1    188837
2     83169
3     73600
4    769446
5     79070
6    162674
7      1731
dtype: int64


In [22]:
# Keep only the model inputs plus the label, with 'class' as the last column
# (the feature/label split later in the notebook takes the label by position).
# Selecting the wanted columns by name already discards 'name' and
# 'unix_timestamp'; unlike the explicit drop() calls it does not raise KeyError
# when this cell is run a second time.
player_data = player_data[['month','day','hour','minute','name_encoded','uagent','class']]
print(player_data.columns)

Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent', 'class'], dtype='object')


In [23]:
# Count rows per label value. Each class is looked up by its value (0 = not
# fraud, 1 = fraud) rather than by tuple-unpacking a group-by result, so the
# cell still works when one class is missing or an extra label value appears.
class_counts = player_data['class'].value_counts()
nonfrauds = class_counts.get(0, 0)
frauds = class_counts.get(1, 0)

# Guard the percentage against an empty frame.
total = frauds + nonfrauds
fraud_percentage = 100.*frauds/total if total else 0.0

print('Number of frauds: ', frauds)
print('Number of non-frauds: ', nonfrauds)
print('Percentage of fradulent data:', fraud_percentage)

# NOTE(review): in the recorded outputs the number of frauds (1731) equals the
# row count of name_encoded == 7 -- check whether the label is fully determined
# by the transaction type before trusting the model's scores.



Number of frauds:  1731
Number of non-frauds:  1357249
Percentage of fradulent data: 0.12737494297193483


The `class` column indicates whether or not a transaction is fraudulent. We see that the vast majority of the data is non-fraudulent, with only $1731$ rows ($0.127\%$) corresponding to fraudulent examples.

In [30]:
# Column listing, followed by summary statistics for the modelling columns
# (the describe() result is the cell's displayed value).
print(player_data.columns)
player_data.loc[:, ['month','day','hour','minute','name_encoded','uagent','class']].describe()



Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent', 'class'], dtype='object')


Unnamed: 0,month,day,hour,minute,name_encoded,uagent,class
count,1358980.0,1358980.0,1358980.0,1358980.0,1358980.0,1358980.0,1358980.0
mean,4.728811,4.009378,11.36756,29.92128,3.706655,5.523543,0.001273749
std,0.9411845,1.985817,6.890938,17.43651,1.420315,2.860383,0.0356669
min,1.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,4.0,2.0,5.0,15.0,3.0,3.0,0.0
50%,5.0,4.0,11.0,30.0,4.0,6.0,0.0
75%,5.0,6.0,17.0,45.0,4.0,8.0,0.0
max,12.0,7.0,23.0,59.0,7.0,10.0,1.0


In [31]:
print(player_data.columns)

# Positional split: the final column is the label, every column before it is a
# feature.
label_column = player_data.columns[-1]
feature_columns = player_data.columns[:-1]

print('feature_columns={}'.format(feature_columns))
print('label_column={}'.format(label_column))

# Cast to float32 first, then pull out the underlying NumPy arrays.
features = player_data[feature_columns].astype('float32').values
labels = player_data[label_column].astype('float32').values

Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent', 'class'], dtype='object')
feature_columns=Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent'], dtype='object')
label_column=class


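To summarize the preprocessing done so far: the transaction `name` was label-encoded into `name_encoded`, the `name` and `unix_timestamp` columns were dropped, and the remaining columns were split into a `float32` feature matrix and a `float32` label vector (`class`).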

## SageMaker Linear Learner

### Prepare Data and Upload to S3

The Amazon common libraries provide utilities to convert NumPy n-dimensional arrays into the Record-IO format, which SageMaker uses for a concise representation of features and labels. The Record-IO format is implemented via protocol buffers, so the serialization is very efficient.

In [26]:
import io
import sagemaker.amazon.common as smac

# In-memory buffer that receives the serialized training records.
buf = io.BytesIO()
# Write the feature matrix and the label vector into the buffer.
smac.write_numpy_to_dense_tensor(buf, features, labels)
# Rewind to the start of the buffer; the trailing ';' hides the cell output.
buf.seek(0);

Now we upload the data to S3 using boto3.

In [27]:
import boto3
import os
import sagemaker

session = sagemaker.Session()
bucket = 'percona2020-player-events'

prefix = 'transactions_cheat_model'
key = 'recordio-pb-data'

# S3 object keys always use '/' as separator, so build the key explicitly
# rather than with os.path.join (whose separator depends on the OS), and reuse
# the same key for the URI so the uploaded object and the training channel
# cannot drift apart.
train_key = '{}/train/{}'.format(prefix, key)

# upload_fileobj reads from the buffer's current position; make sure that is
# the start so the full payload is uploaded.
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)

s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('Uploaded training data location: {}'.format(s3_train_data))

# Location under the same prefix that is passed to the estimator as output_path.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

Uploaded training data location: s3://percona2020-player-events/transactions_cheat_model/train/recordio-pb-data
Training artifacts will be uploaded to: s3://percona2020-player-events/transactions_cheat_model/output


Now we train a Linear Learner using SageMaker's built-in algorithm. To specify the Linear Learner algorithm, we use a utility function to obtain its URI. A complete list of built-in algorithms is found here: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html

In [28]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the Linear Learner container image URI for the current session's region.
# NOTE(review): get_image_uri appears to be the SageMaker Python SDK v1 helper
# (v2 offers sagemaker.image_uris.retrieve) -- confirm the installed SDK version.
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

SageMaker abstracts training with Estimators. We pass the container and the other training parameters to the estimator, set the hyperparameters for the Linear Learner, and fit the estimator to the data in S3.

In [29]:
from sagemaker import get_execution_role

# Estimator for the Linear Learner container: one ml.c4.xlarge training
# instance, artifacts written to output_location.
linear = sagemaker.estimator.Estimator(container,
                                       get_execution_role(),
                                       train_instance_count=1,
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=session)

# feature_dim is the number of feature columns; the model is trained as a
# binary classifier on mini-batches of 200 records.
linear.set_hyperparameters(feature_dim=features.shape[1],
                           predictor_type='binary_classifier',
                           mini_batch_size=200)

# Block until the training job has finished (the default behaviour of fit).
# With wait=False the call returns immediately, and the deploy step that
# follows would run before any model artifact exists when the notebook is
# executed top to bottom.
linear.fit({'train': s3_train_data})

### Host Linear Classifier

Now we deploy the estimator to an endpoint.

In [32]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Deploy the trained estimator to an endpoint named "trans-cheat" on a single
# ml.m4.xlarge instance.
linear_predictor = linear.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
    endpoint_name="trans-cheat",
)

# Specify input and output formats: CSV requests, JSON responses.
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer
linear_predictor.content_type = 'text/csv'

-------------------------!

Call the model from Aurora, for example with queries such as:
select *, trans_cheat_score(month,day,hour,minute,name_encoded,uagent) class from transactions where class>0;

select playerGuid from (select playerGuid,trans_cheat_score(12,5,60,43,7,3) class from transactions) as t where t.class>0;

## Clean up

We will leave the prediction endpoint running at the end of this notebook so we can handle incoming event streams. However, don't forget to delete the prediction endpoint when you're done. You can do that at the Amazon SageMaker console in the Endpoints page. Or you can run `linear_predictor.delete_endpoint()`