{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Players non-legit authentication detector with SageMaker Linear Regression - Logistic Regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Investigate and process the data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training input/output will be stored in: s3://percona2020-player-events/\n" ] } ], "source": [ "import boto3\n", "import botocore\n", "import sagemaker\n", "import sys\n", "\n", "\n", "bucket = 'percona2020-player-events' # <--- specify a bucket you have access to\n", "execution_role = sagemaker.get_execution_role()\n", "\n", "\n", "\n", "# check if the bucket exists\n", "try:\n", " boto3.Session().client('s3').head_bucket(Bucket=bucket)\n", "except botocore.exceptions.ParamValidationError as e:\n", " print('Hey! You either forgot to specify your S3 bucket'\n", " ' or you gave your bucket an invalid name!')\n", "except botocore.exceptions.ClientError as e:\n", " if e.response['Error']['Code'] == '403':\n", " print(\"Hey! You don't have permission to access the bucket, {}.\".format(bucket))\n", " elif e.response['Error']['Code'] == '404':\n", " print(\"Hey! Your bucket, {}, doesn't exist!\".format(bucket))\n", " else:\n", " raise\n", "else:\n", " print('Training input/output will be stored in: s3://{}/'.format(bucket))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " PRE AWSLogs/\n", " PRE curated-data/\n", " PRE encounters/\n", " PRE players_cheat_model/\n", " PRE sagemaker/\n", " PRE transactions_cheat_model/\n", "2020-03-24 16:26:21 5014975 curated-data\n", "2020-02-24 04:40:50 2496166 model.tar.gz\n", "2020-02-24 04:55:37 5247027 player-dynamic-encounters.csv\n", "2020-02-26 16:48:23 15192938184 player_encounters.csv\n", "2020-04-26 06:43:22 92520718 players-auth.csv.part_00000\n", "2020-04-23 05:32:36 55749535 players-transactions.csv.part_00000\n", "2020-03-24 23:22:41 5307147 results.csv\n", "2020-04-20 22:43:26 15196650 training_player_trans.csv\n" ] } ], "source": [ "%%bash\n", "aws s3 ls s3://percona2020-player-events/" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['timestamp', 'playerGuId', 'uagent', 'day', 'month', 'hour', 'minute',\n", " 'src_ip', 'src_ip_encoded', 'class', 'cidr'],\n", " dtype='object')\n", "CPU times: user 2.87 s, sys: 507 ms, total: 3.38 s\n", "Wall time: 2.21 s\n" ] } ], "source": [ "%%time\n", "\n", "import pandas as pd\n", "import urllib.request\n", "import boto3\n", "\n", "# execute in RDS to generate data_source\n", "# select month,day,hour,minute,unix_timestamp,name,uagent,class from transactions INTO OUTFILE S3 's3-us-west-2://percona2020-player-events/players-transactions.csv' FORMAT CSV HEADER OVERWRITE ON;\n", "\n", "data_filename = 'players-auth.csv.part_00000' # <--- specify the file exported from RDS\n", "data_objectname = 'players-auth.csv.part_00000'\n", "data_source = 'percona2020-player-events'\n", "\n", "\n", "s3 = boto3.client('s3')\n", "s3.download_file(data_source, data_objectname, data_filename)\n", "\n", "player_data = pd.read_csv(data_filename, delimiter=',')\n", "print(player_data.columns)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['timestamp', 'playerGuId', 
'uagent', 'day', 'month', 'hour', 'minute',\n", " 'src_ip', 'src_ip_encoded', 'class', 'cidr'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampplayerGuIduagentdaymonthhourminutesrc_ipsrc_ip_encodedclasscidr
02020-04-19 20:00:20463ca444-cdd8-4b64-b6c9-9ddc0de78854814200208.249.139.421474836470208.249.139/24
12020-04-15 06:18:40821f2ac8-5206-44e4-8450-79ab5448cef1644618230.147.6.25121474836470230.147.6/24
22020-04-30 07:12:49857a78f7-d433-440e-bc38-19a0458b5adf45471245.96.190.224761315040045.96.190/24
32020-04-05 09:05:07aac335fd-3ec0-4ba2-9fa0-e52bbfcb058311495172.80.139.10521474836470172.80.139/24
42020-06-05 00:30:1778a13dd5-874b-4850-a1b4-da5fd646fb25566030205.93.28.9021474836470205.93.28/24
\n", "
" ], "text/plain": [ " timestamp playerGuId uagent day \\\n", "0 2020-04-19 20:00:20 463ca444-cdd8-4b64-b6c9-9ddc0de78854 8 1 \n", "1 2020-04-15 06:18:40 821f2ac8-5206-44e4-8450-79ab5448cef1 6 4 \n", "2 2020-04-30 07:12:49 857a78f7-d433-440e-bc38-19a0458b5adf 4 5 \n", "3 2020-04-05 09:05:07 aac335fd-3ec0-4ba2-9fa0-e52bbfcb0583 1 1 \n", "4 2020-06-05 00:30:17 78a13dd5-874b-4850-a1b4-da5fd646fb25 5 6 \n", "\n", " month hour minute src_ip src_ip_encoded class cidr \n", "0 4 20 0 208.249.139.4 2147483647 0 208.249.139/24 \n", "1 4 6 18 230.147.6.251 2147483647 0 230.147.6/24 \n", "2 4 7 12 45.96.190.224 761315040 0 45.96.190/24 \n", "3 4 9 5 172.80.139.105 2147483647 0 172.80.139/24 \n", "4 6 0 30 205.93.28.90 2147483647 0 205.93.28/24 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(player_data.columns)\n", "player_data.head()\n", "#player_data[['timestamp','playerGuId','name', 'class']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's take a peek at our data (we only show a subset of the columns in the table):" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uagentdaymonthhourminutesrc_ip_encodedclass
081420021474836470
164461821474836470
24547127613150400
31149521474836470
456603021474836470
\n", "
" ], "text/plain": [ " uagent day month hour minute src_ip_encoded class\n", "0 8 1 4 20 0 2147483647 0\n", "1 6 4 4 6 18 2147483647 0\n", "2 4 5 4 7 12 761315040 0\n", "3 1 1 4 9 5 2147483647 0\n", "4 5 6 6 0 30 2147483647 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "player_data=player_data.drop('playerGuId',axis=1)\n", "player_data=player_data.drop('timestamp',axis=1)\n", "player_data=player_data.drop('src_ip',axis=1)\n", "player_data=player_data.drop('cidr',axis=1)\n", "player_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "encode transaction type (name)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
monthdayhourminuteunix_timestampnameuagentclassname_encoded
0475441587793483LootBoxesType2804
15619391590781178LootBoxesType2604
24222581587423493Wormhole706
34116121587312731Wormhole806
45519311589484694Wormhole906
\n", "
" ], "text/plain": [ " month day hour minute unix_timestamp name uagent class \\\n", "0 4 7 5 44 1587793483 LootBoxesType2 8 0 \n", "1 5 6 19 39 1590781178 LootBoxesType2 6 0 \n", "2 4 2 22 58 1587423493 Wormhole 7 0 \n", "3 4 1 16 12 1587312731 Wormhole 8 0 \n", "4 5 5 19 31 1589484694 Wormhole 9 0 \n", "\n", " name_encoded \n", "0 4 \n", "1 4 \n", "2 6 \n", "3 6 \n", "4 6 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import csv\n", "import sys\n", "import pandas as pd\n", "pd.set_option(\"display.max_rows\", None, \"display.max_columns\", None)\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "label_encoder = LabelEncoder()\n", "integer_encoded = label_encoder.fit_transform(player_data.name)\n", "player_data[\"name_encoded\"]=integer_encoded\n", "player_data.head()\n", "\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "training 0.028995871664011963%\n", "769075\n" ] } ], "source": [ "legit,fraud=player_data.groupby('class').size()\n", "print('training {}%'.format(fraud/legit*100))\n", "print(legit)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent', 'class'], dtype='object')\n" ] } ], "source": [ "player_data=player_data.drop('name',axis=1)\n", "player_data=player_data.drop('unix_timestamp',axis=1)\n", "player_data=player_data[['month','day','hour','minute','name_encoded','uagent','class']]\n", "print(player_data.columns)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of frauds: 223\n", "Number of non-frauds: 769075\n", "Percentage of fradulent data: 0.0289874664954283\n" ] } ], "source": [ "nonfrauds, frauds = player_data.groupby('class').size()\n", "print('Number of frauds: ', frauds)\n", "print('Number of non-frauds: ', nonfrauds)\n", "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))\n", "\n", "#player_data=player_data.drop('name',axis=1)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The class column corresponds to whether or not a transaction is fradulent. We see that the majority of data is non-fraudulant with only $1731$ ($0.127\\%$) of the data corresponding to fraudulant examples." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded', 'class'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uagentdaymonthhourminutesrc_ip_encodedclass
count769298.000000769298.000000769298.000000769298.000000769298.0000007.692980e+05769298.000000
mean5.4988644.0381174.70298611.69616630.3156931.600717e+090.000290
std2.8383231.9868810.8193576.89848317.0309386.998586e+080.017023
min1.0000001.0000001.0000000.0000000.0000001.066292e+060.000000
25%3.0000002.0000004.0000006.00000016.0000001.026122e+090.000000
50%5.0000004.0000005.00000012.00000031.0000002.136689e+090.000000
75%8.0000006.0000005.00000018.00000045.0000002.147484e+090.000000
max11.0000007.00000012.00000023.00000059.0000002.147484e+091.000000
\n", "
" ], "text/plain": [ " uagent day month hour \\\n", "count 769298.000000 769298.000000 769298.000000 769298.000000 \n", "mean 5.498864 4.038117 4.702986 11.696166 \n", "std 2.838323 1.986881 0.819357 6.898483 \n", "min 1.000000 1.000000 1.000000 0.000000 \n", "25% 3.000000 2.000000 4.000000 6.000000 \n", "50% 5.000000 4.000000 5.000000 12.000000 \n", "75% 8.000000 6.000000 5.000000 18.000000 \n", "max 11.000000 7.000000 12.000000 23.000000 \n", "\n", " minute src_ip_encoded class \n", "count 769298.000000 7.692980e+05 769298.000000 \n", "mean 30.315693 1.600717e+09 0.000290 \n", "std 17.030938 6.998586e+08 0.017023 \n", "min 0.000000 1.066292e+06 0.000000 \n", "25% 16.000000 1.026122e+09 0.000000 \n", "50% 31.000000 2.136689e+09 0.000000 \n", "75% 45.000000 2.147484e+09 0.000000 \n", "max 59.000000 2.147484e+09 1.000000 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(player_data.columns)\n", "player_data[['uagent','day','month','hour','minute','src_ip_encoded','class']].describe()\n", "\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded', 'class'], dtype='object')\n", "feature_columns=Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded'], dtype='object')\n", "label_column=class\n" ] } ], "source": [ "print(player_data.columns)\n", "feature_columns = player_data.columns[:-1]\n", "label_column = player_data.columns[-1]\n", "\n", "print('feature_columns={}'.format(feature_columns))\n", "print('label_column={}'.format(label_column))\n", "\n", "features = player_data[feature_columns].values.astype('float32')\n", "labels = (player_data[label_column].values).astype('float32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's do some analysis and discuss different ways we can preprocess our data. Let's discuss the way in which this data was preprocessed." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SageMaker Linear Learner" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare Data and Upload to S3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Amazon common libraries provide utilities to convert NumPy n-dimensional arrays into a the Record-IO format which SageMaker uses for a concise representation of features and labels. The Record-IO format is implemented via protocol buffer so the serialization is very efficient." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import io\n", "import sagemaker.amazon.common as smac\n", "\n", "buf = io.BytesIO()\n", "smac.write_numpy_to_dense_tensor(buf, features, labels)\n", "buf.seek(0);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we upload the data to S3 using boto3." 
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uploaded training data location: s3://percona2020-player-events/transactions_cheat_model/train/recordio-pb-data\n", "Training artifacts will be uploaded to: s3://percona2020-player-events/transactions_cheat_model/output\n" ] } ], "source": [ "import boto3\n", "import os\n", "import sagemaker\n", "\n", "session = sagemaker.Session()\n", "bucket = 'percona2020-player-events'\n", "\n", "prefix = 'transactions_cheat_model'\n", "key = 'recordio-pb-data'\n", "\n", "boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)\n", "\n", "s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)\n", "print('Uploaded training data location: {}'.format(s3_train_data))\n", "\n", "output_location = 's3://{}/{}/output'.format(bucket, prefix)\n", "print('Training artifacts will be uploaded to: {}'.format(output_location))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we train a Linear Learner using SageMaker's built-in algorithm. To specify the Linear Learner algorithm, we use a utility function to obtain it's URI. A complete list of build-in algorithms is found here: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "\n", "container = get_image_uri(boto3.Session().region_name, 'linear-learner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "SageMaker abstracts training with Estimators. We can pass container, and all parameters to the estimator, as well as the hyperparameters for the linear learner and fit the estimator to the data in S3." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "from sagemaker import get_execution_role\n", "\n", "linear = sagemaker.estimator.Estimator(container,\n", " get_execution_role(), \n", " train_instance_count=1, \n", " train_instance_type='ml.c4.xlarge',\n", " output_path=output_location,\n", " sagemaker_session=session)\n", "linear.set_hyperparameters(feature_dim=features.shape[1],\n", " predictor_type='binary_classifier',\n", " mini_batch_size=200)\n", "\n", "linear.fit({'train': s3_train_data},wait=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Host Linear Classifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we deploy the estimator to and endpoint." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using already existing model: linear-learner-2020-04-26-06-50-49-092\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-------------!" 
] } ], "source": [ "from sagemaker.predictor import csv_serializer, json_deserializer\n", "\n", "linear_predictor = linear.deploy(initial_instance_count=1,\n", " endpoint_name=\"auth-cheat\",\n", " instance_type='ml.m4.xlarge')\n", "# Specify input and output formats.\n", "linear_predictor.content_type = 'text/csv'\n", "linear_predictor.serializer = csv_serializer\n", "linear_predictor.deserializer = json_deserializer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Call the model from aurora\n", "select *, trans_cheat_score(month,day,hour,minute,name_encoded,uagent) class from transactions where class>0;\n", "\n", "select playerGuid from (select playerGuid,trans_cheat_score(12,5,60,43,7,3) class from transactions) as t where t.class>0;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean up\n", "\n", "We will leave the prediction endpoint running at the end of this notebook so we can handle incoming event streams. However, don't forget to delete the prediction endpoint when you're done. You can do that at the Amazon SageMaker console in the Endpoints page. Or you can run `linear_predictor.delete_endpoint()`" ] } ], "metadata": { "kernelspec": { "display_name": "conda_amazonei_tensorflow_p36", "language": "python", "name": "conda_amazonei_tensorflow_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }