{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Players non-legit authentication detector with SageMaker Linear Regression - Logistic Regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Investigate and process the data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training input/output will be stored in: s3://percona2020-player-events/\n" ] } ], "source": [ "import boto3\n", "import botocore\n", "import sagemaker\n", "import sys\n", "\n", "\n", "bucket = 'percona2020-player-events' # <--- specify a bucket you have access to\n", "execution_role = sagemaker.get_execution_role()\n", "\n", "\n", "\n", "# check if the bucket exists\n", "try:\n", " boto3.Session().client('s3').head_bucket(Bucket=bucket)\n", "except botocore.exceptions.ParamValidationError as e:\n", " print('Hey! You either forgot to specify your S3 bucket'\n", " ' or you gave your bucket an invalid name!')\n", "except botocore.exceptions.ClientError as e:\n", " if e.response['Error']['Code'] == '403':\n", " print(\"Hey! You don't have permission to access the bucket, {}.\".format(bucket))\n", " elif e.response['Error']['Code'] == '404':\n", " print(\"Hey! Your bucket, {}, doesn't exist!\".format(bucket))\n", " else:\n", " raise\n", "else:\n", " print('Training input/output will be stored in: s3://{}/'.format(bucket))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " PRE AWSLogs/\n", " PRE curated-data/\n", " PRE encounters/\n", " PRE players_cheat_model/\n", " PRE sagemaker/\n", " PRE transactions_cheat_model/\n", "2020-03-24 16:26:21 5014975 curated-data\n", "2020-02-24 04:40:50 2496166 model.tar.gz\n", "2020-02-24 04:55:37 5247027 player-dynamic-encounters.csv\n", "2020-02-26 16:48:23 15192938184 player_encounters.csv\n", "2020-04-26 06:43:22 92520718 players-auth.csv.part_00000\n", "2020-04-23 05:32:36 55749535 players-transactions.csv.part_00000\n", "2020-03-24 23:22:41 5307147 results.csv\n", "2020-04-20 22:43:26 15196650 training_player_trans.csv\n" ] } ], "source": [ "%%bash\n", "aws s3 ls s3://percona2020-player-events/" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['timestamp', 'playerGuId', 'uagent', 'day', 'month', 'hour', 'minute',\n", " 'src_ip', 'src_ip_encoded', 'class', 'cidr'],\n", " dtype='object')\n", "CPU times: user 2.87 s, sys: 507 ms, total: 3.38 s\n", "Wall time: 2.21 s\n" ] } ], "source": [ "%%time\n", "\n", "import pandas as pd\n", "import urllib.request\n", "import boto3\n", "\n", "# execute in RDS to generate data_source\n", "# select month,day,hour,minute,unix_timestamp,name,uagent,class from transactions INTO OUTFILE S3 's3-us-west-2://percona2020-player-events/players-transactions.csv' FORMAT CSV HEADER OVERWRITE ON;\n", "\n", "data_filename = 'players-auth.csv.part_00000' # <--- specify the file exported from RDS\n", "data_objectname = 'players-auth.csv.part_00000'\n", "data_source = 'percona2020-player-events'\n", "\n", "\n", "s3 = boto3.client('s3')\n", "s3.download_file(data_source, data_objectname, data_filename)\n", "\n", "player_data = pd.read_csv(data_filename, delimiter=',')\n", "print(player_data.columns)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['timestamp', 'playerGuId', 
'uagent', 'day', 'month', 'hour', 'minute',\n", " 'src_ip', 'src_ip_encoded', 'class', 'cidr'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampplayerGuIduagentdaymonthhourminutesrc_ipsrc_ip_encodedclasscidr
02020-04-19 20:00:20463ca444-cdd8-4b64-b6c9-9ddc0de78854814200208.249.139.421474836470208.249.139/24
12020-04-15 06:18:40821f2ac8-5206-44e4-8450-79ab5448cef1644618230.147.6.25121474836470230.147.6/24
22020-04-30 07:12:49857a78f7-d433-440e-bc38-19a0458b5adf45471245.96.190.224761315040045.96.190/24
32020-04-05 09:05:07aac335fd-3ec0-4ba2-9fa0-e52bbfcb058311495172.80.139.10521474836470172.80.139/24
42020-06-05 00:30:1778a13dd5-874b-4850-a1b4-da5fd646fb25566030205.93.28.9021474836470205.93.28/24
\n", "
" ], "text/plain": [ " timestamp playerGuId uagent day \\\n", "0 2020-04-19 20:00:20 463ca444-cdd8-4b64-b6c9-9ddc0de78854 8 1 \n", "1 2020-04-15 06:18:40 821f2ac8-5206-44e4-8450-79ab5448cef1 6 4 \n", "2 2020-04-30 07:12:49 857a78f7-d433-440e-bc38-19a0458b5adf 4 5 \n", "3 2020-04-05 09:05:07 aac335fd-3ec0-4ba2-9fa0-e52bbfcb0583 1 1 \n", "4 2020-06-05 00:30:17 78a13dd5-874b-4850-a1b4-da5fd646fb25 5 6 \n", "\n", " month hour minute src_ip src_ip_encoded class cidr \n", "0 4 20 0 208.249.139.4 2147483647 0 208.249.139/24 \n", "1 4 6 18 230.147.6.251 2147483647 0 230.147.6/24 \n", "2 4 7 12 45.96.190.224 761315040 0 45.96.190/24 \n", "3 4 9 5 172.80.139.105 2147483647 0 172.80.139/24 \n", "4 6 0 30 205.93.28.90 2147483647 0 205.93.28/24 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(player_data.columns)\n", "player_data.head()\n", "#player_data[['timestamp','playerGuId','name', 'class']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's take a peek at our data (we only show a subset of the columns in the table):" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uagentdaymonthhourminutesrc_ip_encodedclass
081420021474836470
164461821474836470
24547127613150400
31149521474836470
456603021474836470
\n", "
" ], "text/plain": [ " uagent day month hour minute src_ip_encoded class\n", "0 8 1 4 20 0 2147483647 0\n", "1 6 4 4 6 18 2147483647 0\n", "2 4 5 4 7 12 761315040 0\n", "3 1 1 4 9 5 2147483647 0\n", "4 5 6 6 0 30 2147483647 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "player_data=player_data.drop('playerGuId',axis=1)\n", "player_data=player_data.drop('timestamp',axis=1)\n", "player_data=player_data.drop('src_ip',axis=1)\n", "player_data=player_data.drop('cidr',axis=1)\n", "player_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "encode transaction type (name)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
monthdayhourminuteunix_timestampnameuagentclassname_encoded
0475441587793483LootBoxesType2804
15619391590781178LootBoxesType2604
24222581587423493Wormhole706
34116121587312731Wormhole806
45519311589484694Wormhole906
\n", "
" ], "text/plain": [ " month day hour minute unix_timestamp name uagent class \\\n", "0 4 7 5 44 1587793483 LootBoxesType2 8 0 \n", "1 5 6 19 39 1590781178 LootBoxesType2 6 0 \n", "2 4 2 22 58 1587423493 Wormhole 7 0 \n", "3 4 1 16 12 1587312731 Wormhole 8 0 \n", "4 5 5 19 31 1589484694 Wormhole 9 0 \n", "\n", " name_encoded \n", "0 4 \n", "1 4 \n", "2 6 \n", "3 6 \n", "4 6 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import csv\n", "import sys\n", "import pandas as pd\n", "pd.set_option(\"display.max_rows\", None, \"display.max_columns\", None)\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "label_encoder = LabelEncoder()\n", "integer_encoded = label_encoder.fit_transform(player_data.name)\n", "player_data[\"name_encoded\"]=integer_encoded\n", "player_data.head()\n", "\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "training 0.028995871664011963%\n", "769075\n" ] } ], "source": [ "legit,fraud=player_data.groupby('class').size()\n", "print('training {}%'.format(fraud/legit*100))\n", "print(legit)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['month', 'day', 'hour', 'minute', 'name_encoded', 'uagent', 'class'], dtype='object')\n" ] } ], "source": [ "player_data=player_data.drop('name',axis=1)\n", "player_data=player_data.drop('unix_timestamp',axis=1)\n", "player_data=player_data[['month','day','hour','minute','name_encoded','uagent','class']]\n", "print(player_data.columns)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of frauds: 223\n", "Number of non-frauds: 769075\n", "Percentage of fradulent data: 0.0289874664954283\n" ] } ], "source": [ "nonfrauds, frauds = player_data.groupby('class').size()\n", "print('Number of frauds: ', frauds)\n", "print('Number of non-frauds: ', nonfrauds)\n", "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))\n", "\n", "#player_data=player_data.drop('name',axis=1)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The class column corresponds to whether or not a transaction is fradulent. We see that the majority of data is non-fraudulant with only $1731$ ($0.127\\%$) of the data corresponding to fraudulant examples." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded', 'class'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uagentdaymonthhourminutesrc_ip_encodedclass
count769298.000000769298.000000769298.000000769298.000000769298.0000007.692980e+05769298.000000
mean5.4988644.0381174.70298611.69616630.3156931.600717e+090.000290
std2.8383231.9868810.8193576.89848317.0309386.998586e+080.017023
min1.0000001.0000001.0000000.0000000.0000001.066292e+060.000000
25%3.0000002.0000004.0000006.00000016.0000001.026122e+090.000000
50%5.0000004.0000005.00000012.00000031.0000002.136689e+090.000000
75%8.0000006.0000005.00000018.00000045.0000002.147484e+090.000000
max11.0000007.00000012.00000023.00000059.0000002.147484e+091.000000
\n", "
" ], "text/plain": [ " uagent day month hour \\\n", "count 769298.000000 769298.000000 769298.000000 769298.000000 \n", "mean 5.498864 4.038117 4.702986 11.696166 \n", "std 2.838323 1.986881 0.819357 6.898483 \n", "min 1.000000 1.000000 1.000000 0.000000 \n", "25% 3.000000 2.000000 4.000000 6.000000 \n", "50% 5.000000 4.000000 5.000000 12.000000 \n", "75% 8.000000 6.000000 5.000000 18.000000 \n", "max 11.000000 7.000000 12.000000 23.000000 \n", "\n", " minute src_ip_encoded class \n", "count 769298.000000 7.692980e+05 769298.000000 \n", "mean 30.315693 1.600717e+09 0.000290 \n", "std 17.030938 6.998586e+08 0.017023 \n", "min 0.000000 1.066292e+06 0.000000 \n", "25% 16.000000 1.026122e+09 0.000000 \n", "50% 31.000000 2.136689e+09 0.000000 \n", "75% 45.000000 2.147484e+09 0.000000 \n", "max 59.000000 2.147484e+09 1.000000 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(player_data.columns)\n", "player_data[['uagent','day','month','hour','minute','src_ip_encoded','class']].describe()\n", "\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded', 'class'], dtype='object')\n", "feature_columns=Index(['uagent', 'day', 'month', 'hour', 'minute', 'src_ip_encoded'], dtype='object')\n", "label_column=class\n" ] } ], "source": [ "print(player_data.columns)\n", "feature_columns = player_data.columns[:-1]\n", "label_column = player_data.columns[-1]\n", "\n", "print('feature_columns={}'.format(feature_columns))\n", "print('label_column={}'.format(label_column))\n", "\n", "features = player_data[feature_columns].values.astype('float32')\n", "labels = (player_data[label_column].values).astype('float32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's do some analysis and discuss different ways we can preprocess our data. Let's discuss the way in which this data was preprocessed." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SageMaker Linear Learner" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare Data and Upload to S3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Amazon common libraries provide utilities to convert NumPy n-dimensional arrays into a the Record-IO format which SageMaker uses for a concise representation of features and labels. The Record-IO format is implemented via protocol buffer so the serialization is very efficient." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import io\n", "import sagemaker.amazon.common as smac\n", "\n", "buf = io.BytesIO()\n", "smac.write_numpy_to_dense_tensor(buf, features, labels)\n", "buf.seek(0);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we upload the data to S3 using boto3." 
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uploaded training data location: s3://percona2020-player-events/transactions_cheat_model/train/recordio-pb-data\n", "Training artifacts will be uploaded to: s3://percona2020-player-events/transactions_cheat_model/output\n" ] } ], "source": [ "import boto3\n", "import os\n", "import sagemaker\n", "\n", "session = sagemaker.Session()\n", "bucket = 'percona2020-player-events'\n", "\n", "prefix = 'transactions_cheat_model'\n", "key = 'recordio-pb-data'\n", "\n", "boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)\n", "\n", "s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)\n", "print('Uploaded training data location: {}'.format(s3_train_data))\n", "\n", "output_location = 's3://{}/{}/output'.format(bucket, prefix)\n", "print('Training artifacts will be uploaded to: {}'.format(output_location))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we train a Linear Learner using SageMaker's built-in algorithm. To specify the Linear Learner algorithm, we use a utility function to obtain it's URI. A complete list of build-in algorithms is found here: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "\n", "container = get_image_uri(boto3.Session().region_name, 'linear-learner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "SageMaker abstracts training with Estimators. We can pass container, and all parameters to the estimator, as well as the hyperparameters for the linear learner and fit the estimator to the data in S3." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "from sagemaker import get_execution_role\n", "\n", "linear = sagemaker.estimator.Estimator(container,\n", " get_execution_role(), \n", " train_instance_count=1, \n", " train_instance_type='ml.c4.xlarge',\n", " output_path=output_location,\n", " sagemaker_session=session)\n", "linear.set_hyperparameters(feature_dim=features.shape[1],\n", " predictor_type='binary_classifier',\n", " mini_batch_size=200)\n", "\n", "linear.fit({'train': s3_train_data},wait=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Host Linear Classifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we deploy the estimator to and endpoint." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using already existing model: linear-learner-2020-04-26-06-50-49-092\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-------------!" 
] } ], "source": [ "from sagemaker.predictor import csv_serializer, json_deserializer\n", "\n", "linear_predictor = linear.deploy(initial_instance_count=1,\n", " endpoint_name=\"auth-cheat\",\n", " instance_type='ml.m4.xlarge')\n", "# Specify input and output formats.\n", "linear_predictor.content_type = 'text/csv'\n", "linear_predictor.serializer = csv_serializer\n", "linear_predictor.deserializer = json_deserializer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Call the model from aurora\n", "select *, trans_cheat_score(month,day,hour,minute,name_encoded,uagent) class from transactions where class>0;\n", "\n", "select playerGuid from (select playerGuid,trans_cheat_score(12,5,60,43,7,3) class from transactions) as t where t.class>0;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean up\n", "\n", "We will leave the prediction endpoint running at the end of this notebook so we can handle incoming event streams. However, don't forget to delete the prediction endpoint when you're done. You can do that at the Amazon SageMaker console in the Endpoints page. Or you can run `linear_predictor.delete_endpoint()`" ] } ], "metadata": { "kernelspec": { "display_name": "conda_amazonei_tensorflow_p36", "language": "python", "name": "conda_amazonei_tensorflow_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }