{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# STK player bot detector with SageMaker Linear Regression - Logistic Regression¶\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Install packages" ] }, { "cell_type": "code", "execution_count": 707, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: mysql-connector-python in /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages (8.0.21)\n", "Requirement already satisfied: protobuf>=3.0.0 in /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages (from mysql-connector-python) (3.13.0)\n", "Requirement already satisfied: six>=1.9 in /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages (from protobuf>=3.0.0->mysql-connector-python) (1.15.0)\n", "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages (from protobuf>=3.0.0->mysql-connector-python) (49.2.0.post20200714)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "bash: line 2: /home/ec2-user/SageMaker/amazon-aurora-call-to-amazon-sagemaker-sample/stk-sample/env-build/sagemaker/env: No such file or directory\n", "WARNING: You are using pip version 20.1.1; however, version 20.2.3 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.\n" ] } ], "source": [ " %%bash \n", "home=`pwd`\n", ". 
$home/env\n", "pip3 install mysql-connector-python" ] }, { "cell_type": "code", "execution_count": 708, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "arn:aws:iam::163538056407:role/service-role/AmazonSageMaker-ExecutionRole-20200913T212219\n" ] } ], "source": [ "import sagemaker\n", "execution_role = sagemaker.get_execution_role()\n", "print(execution_role)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get training and test data from Aurora DB" ] }, { "cell_type": "code", "execution_count": 711, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 6.05 ms, sys: 0 ns, total: 6.05 ms\n", "Wall time: 5.6 ms\n" ] } ], "source": [ "%%time\n", "\n", "import json\n", "import boto3\n", "import csv\n", "import mysql.connector\n", "import sys\n", "import os\n", "import configparser\n", "\n", "config = configparser.ConfigParser()\n", "\n", "config.read('rds.ini')\n", "\n", "ENDPOINT=config['default']['ENDPOINT']\n", "USR=config['default']['USR']\n", "PASWD=config['default']['PASWD']\n", "REGION=config['default']['REGION']\n", "DBNAME=config['default']['DBNAME']\n", "\n", "client = boto3.client('rds')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Getting the right size of the grouping" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "import pandas as pd\n", "\n", "group_size=100\n", "\n", "try:\n", " conn = mysql.connector.connect(host=ENDPOINT, user=USR, passwd=PASWD, port=3306, database=DBNAME)\n", " cur = conn.cursor()\n", " cur.execute(\"\"\"SELECT count(id) from actions where class is not null\"\"\")\n", " actions_size_list=cur.fetchall()\n", " actions_size_df=pd.DataFrame(actions_size_list)\n", " actions_size=actions_size_df[0].values[0]\n", " print(\"actions_size=\"+str(actions_size))\n", " modulo_size=actions_size/group_size\n", " print(\"modulo_size=\"+str(modulo_size))\n", "except 
Exception as e:\n", " print(\"Database connection failed due to {}\".format(e))" ] }, { "cell_type": "code", "execution_count": 681, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Database connection failed due to 2005 (HY000): Unknown MySQL server host 'stk-instance-1.ccgqlhzmgy9f.us-west-2.rds.amazonaws.com' (0)\n", "CPU times: user 1.89 ms, sys: 0 ns, total: 1.89 ms\n", "Wall time: 55.5 ms\n" ] } ], "source": [ "%%time\n", "import pandas as pd\n", "\n", "modulo_size=100\n", "\n", "try:\n", " conn = mysql.connector.connect(host=ENDPOINT, user=USR, passwd=PASWD, port=3306, database=DBNAME)\n", " cur = conn.cursor()\n", " \n", " cur.execute(\"\"\"SELECT id,m_ticks,m_action,m_value,m_value_l,m_value_r,class FROM actions WHERE class=1 order by id\"\"\")\n", " bot_action_list=cur.fetchall()\n", " cur.execute(\"\"\"SELECT id,m_ticks,m_action,m_value,m_value_l,m_value_r,class FROM actions WHERE class=0 order by id\"\"\")\n", " player_action_list=cur.fetchall() \n", " cur.execute(\"\"\"\n", " SELECT FLOOR(id/100) sessionid,id,m_ticks,m_kart_id,m_action,m_value,m_value_l,m_value_r,class\n", " FROM actions \n", " WHERE class IS NOT NULL\n", " ORDER BY id,m_kart_id \n", " \"\"\")\n", " action_list=cur.fetchall() \n", "except Exception as e:\n", " print(\"Database connection failed due to {}\".format(e))\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Investigate and process the data" ] }, { "cell_type": "code", "execution_count": 682, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "bot_actions = pd.DataFrame(bot_action_list)\n", "player_actions = pd.DataFrame(player_action_list)" ] }, { "cell_type": "code", "execution_count": 683, "metadata": {}, "outputs": [], "source": [ "bot_actions.rename(columns={0:'id',\n", " 1:'m_ticks',\n", " 2:'m_action',\n", " 3:'m_value',\n", " 4:'m_value_l',\n", " 5:'m_value_r',\n", " 6:'class'\n", " }, \n", " inplace=True)" ] }, { "cell_type": "code", 
"execution_count": 684, "metadata": {}, "outputs": [], "source": [ "player_actions.rename(columns={0:'id',\n", " 1:'m_ticks',\n", " 2:'m_action',\n", " 3:'m_value',\n", " 4:'m_value_l',\n", " 5:'m_value_r',\n", " 6:'class'\n", " }, \n", " inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's look for interesting data patterns" ] }, { "cell_type": "code", "execution_count": 685, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Corralation between ticks in bots and humans" ] }, { "cell_type": "code", "execution_count": 686, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAEGCAYAAAC3lehYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3df5TV9X3n8ecLEETsiAgoMiK4khYSLYmjh0TT3URdqYtBrdnS6sqxnmUltsmepE20OTFpe7IneLZxa9ph141ZUYhKrUZmtnRjsEkOLNEOiT8Co+vEn6NExh+h2kQC+N4/vp8bL+PMcL/D/d4fc1+Pc+bMvZ/7+dz7vtcr73l/Pt/v56uIwMzMrAjj6h2AmZmNXU4yZmZWGCcZMzMrjJOMmZkVxknGzMwKM6HeAdTD9OnTY+7cufUOw8ysqWzfvv2ViJiRZ0xLJpm5c+fS09NT7zDMzJqKpOfyjvF0mZmZFcZJxszMCuMkY2ZmhXGSMTOzwjjJmJlZYVry6DIzs5bT2wvf+Q4cfzx85CMwI9eRyKNWeCUjaaqkeyQ9IalX0gclTZP0gKSn0u9jy/pfL6lP0pOSLihrP0PS4+mxmyUptU+SdHdqf0jS3KLfk5lZU/md34GFC+GTn4Tf/V048US4886avHQtpsv+CviHiPgN4DeBXuA6YHNEzAc2p/tIWggsB94LLAE6JY1Pz7MGWAnMTz9LUvvVwOsRcSpwE7C6Bu/JzKzxDQzAtGlw770Ht+/fD3/wB9njBSs0yUhqA34LuBUgIn4ZET8DlgFrU7e1wMXp9jLgrojYGxHPAH3AWZJmAW0RsS2yC+DcPmhM6bnuAc4tVTlmZi3rvPNg5kx4/fWhH9+/H559tvAwiq5kTgEGgP8l6UeSvi5pCnB8ROwCSL9npv6zgRfKxventtnp9uD2g8ZExH5gD3BcMW/HzKzBrVkDEmzePHK/AwegBttrFZ1kJgAfANZExPuBfyFNjQ1jqAokRmgfaczBTyytlNQjqWegBiWimVlNdXdnyeUTn6is//LlNVn8LzrJ9AP9EfFQun8PWdJ5OU2BkX7vLut/Utn4duCl1N4+RPtBYyRNAI4BXhscSETcEhEdEdExo0ZHVZiZFa67GyZOhIsuqnzM+PHwzW8WF1OZQpNMRPwUeEHSr6emc4GdwEZgRWpbAdyfbm8ElqcjxuaRLfA/nKbU3pC0OK23XDloTOm5LgMeTOs2ZmZjV3c3TJiQJZd9+y
ofN2FCth5TI7U4T+aPgPWSJgJPA1eRJbcNkq4Gngc+DhAROyRtIEtE+4FrI+JAep5VwG3AZGBT+oHsoII7JPWRVTDLa/CezMzqo7sbLr44W1PJa/Vq+Oxnqx/TCNSKf/R3dHSEt/o3s6YzdSrs2ZN/3LnnZidiHiZJ2yOiI88YbytjZtbo1q/PFvVHk2B2765KghktJxkzs0a1dWu2hnLFFfnHrl4NETXbPmY4TjJmZo3mxhuz5HLOOfnXXjo7s+RS47WX4XiDTDOzRrF1a5ZY8pLgjjvg8surH9NhciVjZlZvW7dCW9voEsy6dfD22w2ZYMBJxsysftavhyOPzJLLG2/kG/uFL2TTYg2aXEo8XWZmVmu9vXDaaaM712XKFHjzzerHVBBXMmZmtdLbC3PmZNd2yZtgJk6Erq6mSjDgJGNmVrzu7uxEyoUL4YUXDt2/3LhxWXLZuxeWLi0mvgJ5uszMrCiHswUMZIv6Db7mciiuZMzMqm3NmqwCueii0SWYJlnUr4QrGTOzalm/fnRn55d0dsKqVdWLpwE4yZiZHa7eXli0CH75y9GNHwPTYsPxdJmZ2eGYPz9b0B9NghlD02LDcZIxMxuNG2/MtnPp68s/trR55Z//efXjajCeLjMzy2vcuCxJ5DWGp8WG40rGzKxSa9Zk1UveBFOqXFoswYArGTOzykycCPv25RvTgpXLYK5kzMxGsnVrVr3kSTAtsKBfKVcyZmbDmTEDXnml8v6XXAL33ltcPE3IlYyZ2WA33JBVL3kSzO7dTjBDcCVjZlbS25ud85LHVVfBN75RTDxjgJOMmRnAmWdCT0++Mbt3Z1NqNixPl5lZa+vtzabG8iSYxYuzhX0nmEMqPMlIelbS45IekdST2qZJekDSU+n3sWX9r5fUJ+lJSReUtZ+RnqdP0s2SlNonSbo7tT8kaW7R78nMxohFi/JPj+3cCdu2FRPPGFSrSuYjEbEoIjrS/euAzRExH9ic7iNpIbAceC+wBOiUND6NWQOsBOannyWp/Wrg9Yg4FbgJWF2D92NmzaxUvTz6aOVjrrkmq14WLCgurjGoXtNly4C16fZa4OKy9rsiYm9EPAP0AWdJmgW0RcS2iAjg9kFjSs91D3BuqcoxM3uXM8/MV71MmpQllzVriotpDKtFkgng25K2S1qZ2o6PiF0A6ffM1D4bKL82aX9qm51uD24/aExE7Af2AMcNDkLSSkk9knoGBgaq8sbMrImMZu1l3Tp4663iYmoBtTi67OyIeEnSTOABSU+M0HeoCiRGaB9pzMENEbcAtwB0dHSMYmc7M2taixblmxo78kj4xS+Ki6eFFF7JRMRL6fdu4D7gLODlNAVG+r07de8HTiob3g68lNrbh2g/aIykCcAxwGtFvBczazIDA/nXXtatc4KpokKTjKQpkn6tdBv4t8CPgY3AitRtBXB/ur0RWJ6OGJtHtsD/cJpSe0PS4rTecuWgMaXnugx4MK3bmFkru/RSmDnz0P1K2tq831gBip4uOx64L63DTwC+GRH/IOmfgA2SrgaeBz4OEBE7JG0AdgL7gWsj4kB6rlXAbcBkYFP6AbgVuENSH1kFs7zg92RmjWw0Z+13dcHSpcXE0+LUin/0d3R0RE/eM3vNrPF57aVQkraXnYpSEZ/xb2bNr7Qdv9deGo73LjOz5tbeDi++WHn/tjbYs6e4eOwgrmTMrDmVqpc8CaarywmmxlzJmFnzyVu9TJrkkyrrxJWMmTWP7u781YvP2q8rVzJm1hymTYPXX6+8v9deGoIrGTNrfFK+BOO1l4bhJGNmjau0uF+phQuzs/Z9YmXD8HSZmTWmU06BZ56pvP/Onb7WSwNykjGzxjIwkG/Psdmzob//0P2sLjxdZmaNI++mllu2OME0OFcyZlZ/eTe1nDMHnnuuuHisalzJmFl9fehD+RLMli1OME3ElYyZ1cf69XDFFfnGtOCu8c3OScbMam/KFPj5zyvvP3lyvv7WMDxdZm
a1U9oWJk/CWLfOCaaJuZIxs9qYPh1efbXy/t7UckxwJWNmxZPyJRhvajlmOMmYWXFuvDHftjATJmSL+5dfXlxMVlNOMmZWfb29WXL53OcqH9PZCfv2FReT1YXXZMysuhYtgkcfrbz/vHnw9NPFxWN15UrGzKqjtGNyngSzZYsTzBjnSsbMDl/eyyF7W5iW4UrGzA5P3sshe1uYllKTJCNpvKQfSepO96dJekDSU+n3sWV9r5fUJ+lJSReUtZ8h6fH02M1SdsiKpEmS7k7tD0maW4v3ZNbyPvOZfEeOjRuXHTl29tnFxWQNp1aVzKeA3rL71wGbI2I+sDndR9JCYDnwXmAJ0ClpfBqzBlgJzE8/S1L71cDrEXEqcBOwuti3YtbiSmsvX/1q5WM6O+HAgeJisoZVeJKR1A78O+DrZc3LgLXp9lrg4rL2uyJib0Q8A/QBZ0maBbRFxLaICOD2QWNKz3UPcG6pyjGzKluwAM45p/L+bW1Z9bJqVXExWUOrRSXz34DPAm+XtR0fEbsA0u/SVYpmAy+U9etPbbPT7cHtB42JiP3AHuC4wUFIWimpR1LPwMDA4b4ns9YzeTI88UTl/bu6YM+e4uKxplBokpG0FNgdEdsrHTJEW4zQPtKYgxsibomIjojomDFjRoXhmBmQ7Zpc6TYvxx+fVS9LlxYbkzWFoiuZs4GPSXoWuAv4qKR1wMtpCoz0e3fq3w+cVDa+HXgptbcP0X7QGEkTgGOA14p4M2Ytp3TmfqW7IH/xi/DTnxYbkzWVQpNMRFwfEe0RMZdsQf/BiLgC2AisSN1WAPen2xuB5emIsXlkC/wPpym1NyQtTustVw4aU3quy9Jr+MpGZodr6dLKr1g5eTLs3g1f+lKhIVnzqdfJmF8BNki6Gnge+DhAROyQtAHYCewHro2I0iEpq4DbgMnApvQDcCtwh6Q+sgpmea3ehNmYdfTR8C//Ulnfri5Pjdmw1Ip/9Hd0dERPT0+9wzBrPN3dcNFFlfdvwX8/Wpmk7RHRkWeMt5Uxs8zMmZDnyEsnGKuAt5Uxa3Wlxf1KE8xRRznBWMWcZMxa2aJFlS/uQ3bFykrXaszwdJlZa9q6Nd+Z++DqxUbFlYxZqznllHwJ5vTTnWBs1JxkzFpFae3lmWcqH7NzZ76LkJkN4ukys1aQ95LIRx3ltRerClcyZmNZd3f+SyJ7cd+qyJWM2Vg1dWq+XZDb2rxrslWdKxmzsWb9+qx6yZMwvC2/FaTiSkbSFOAXEfG2pPcAvwFsioh9hUVnZvlMnlz5lvyl/pXusGw2Cnkqme8DR0qaTXbJ5KvINqw0s3orXRI5T4JZt84JxgqXZ01GEfHztHPy1yLiRkk/KiowM6tQezu8+GLl/U84AXbtKi4eszJ5KhlJ+iBwOfC/U5sPHDCrl9J5L3kSzJYtTjBWU3mSzKeA64H70nVfTgH+sZiwzGxE55+fb8+x0ln7Z59dXExmQ8hTiTwfER8r3YmIpyXdUUBMZjaSSZPgl7+svP/OnbBgQXHxmI0gTyXzd2nRHwBJ/xr4RvVDMrMhrVmTTY9VmmAuuSSrXpxgrI7yVDLXAN+SdBHwAeC/ABcWEpWZHWziRNiX42yB3bthxozi4jGrUMVJJiL+SdIngW8DbwHnR0SOy+iZWW55t+SfPRv6+4uLxyynQyYZSV1A+T7fRwF7gFslUb5OY2ZVMjAA8+fnOwt/yxYv7FvDqaSS+a+FR2Fm7zjvPNi8Od8YX+/FGtQhk0xEfA9A0jxgV0S8le5PBo4vNjyzFrJ+PVxxRb4xl1wC995bTDxmVZBn4f9vgQ+V3T+Q2s6sakRmrWjKlPxbvHhx35pAnkOYJ0TEr46dTLcnVj8ksxZSut5LngSzeHE2PeYEY00gT5IZkPSrRX5Jy4BXRhog6UhJD0t6VNIOSX+W2qdJekDSU+n3sWVjrpfUJ+lJSReUtZ8h6fH02M2SlNonSbo7tT8kaW6O92RWP9Onw0UX5Ruzcy
ds21ZMPGYFyJNkrgH+VNLzkl4APgf8p0OM2Qt8NCJ+E1gELJG0GLgO2BwR88l2dL4OQNJCYDnwXmAJ0ClpfHquNcBKYH76WZLarwZej4hTgZuA1Tnek1ntlfYce/XVysdcc41PrLSmlOc8mZ8AiyUdTbYj8xsVjAngzXT3iPQTwDLg36T2tcB3yZLWMuCuiNgLPCOpDzhL0rNAW0RsA5B0O3AxsCmN+VJ6rnuAv5ak9NpmjWXRonyXQp45E15+ubh4zApWyXkyV0TEOkmfHtQOQER89RDjxwPbgVOBv4mIhyQdHxG70vhdkmam7rOBH5QN709t+9Ltwe2lMS+k59ovaQ9wHIOm8iStJKuEmDNnzqHetll15T2pEnzei40JlVQyU9LvXxvisUNWCxFxAFgkaSpwn6T3jdBdw7zGcO0jjRkcxy3ALQAdHR2ucqx2FiyAJ56ovP+8efD008XFY1ZDlZwn8z/Sze9ExNbyxyRV/GdWRPxM0nfJ1lJeljQrVTGzgN2pWz9wUtmwduCl1N4+RHv5mH5JE4BjgNcqjcusMN3d+Rf2Xb3YGJNn4f9rFbb9iqQZqYIpnbx5HvAEsBFYkbqtAO5PtzcCy9MRY/PIFvgfTlNrb0hanI4qu3LQmNJzXQY86PUYq7tp0/IlmGnTfL0XG5MqWZP5INlJmDMGrcu0AeOHHvUrs4C1aV1mHLAhIrolbQM2pEs5Pw98HCBdDG0DsBPYD1ybptsAVgG3AZPJFvw3pfZbgTvSQQKvkR2dZlY/GmoGdwRdXbB0aTGxmNVZJWsyE4GjU9/ydZl/JqschhURjwHvH6L9VeDcYcZ8GfjyEO09wLvWc9I2Nx8fKQ6zmsi7LUxbW74NMM2aUKV7l31P0m0R8dxw/SR9LSL+qKrRmTWLyZPhrbcq7+/qxVpExWsyIyWYxJPJ1npKV6usNMEccUS29uIEYy0iz8K/mZVs3Zoll098ovIxnZ2VXzrZbIzIswuzmQG0t8OLL1be/8QT8/U3G0OqWcnkPKTGrMmUdkzOkzC++EUnGGtpFVcykjqAzwMnp3Ei257s9NTlr6ofnlmDmD4934aW4Ou9mJFvumw98CfA48Dbgx+MiNuqFJNZ4+jthYUL843x1SrNfiVPkhmIiI2FRWLWaPLumAyuXswGybMm80VJX5f0e5IuLf0UFplZvZSOHMuTYFav9tUqzYaQp5K5CvgNsmvClKbLAvC8gI0deY8cmzMHnjvUKWRmrStPkvnNiDitsEjM6invljDgHZPNKpAnyfxA0sKI2FlYNGa11tsLp50GBw4cum/JtGn5jzQza1F51mTOAR6R9KSkxyQ9LumxogIzK9yHPpQdOZYnwXR1OcGY5ZCnkllSWBRmtTQwADNnHrpfOe+YbDYquTbIHOqnyODMqu7SS/MnmK4uJxizUfLeZdYaRnNS5ezZ0N9fTDxmLcK7MNvYd+aZ+RPMli1OMGZV4CRjY9fAQHZSZU9P5WO+8IXspEofmmxWFZ4us7Hp0kvhvvsq7z9hAuzbV1w8Zi3KlYyNPVK+BNPZ6QRjVhAnGRs7SpdCrtSkSdnU2KpVxcVk1uI8XWZjw4QJ+U6qXLcOLr+8uHjMDHAlY83uxhuz6qXSBHPkkVn14gRjVhOFJhlJJ0n6R0m9knZI+lRqnybpAUlPpd/Hlo25XlJf2r7mgrL2M9JWNn2SbpayeRFJkyTdndofkjS3yPdkDaK0Hf/nPlf5mHXr4Be/KC4mM3uXoiuZ/cBnImIBsBi4VtJC4Dpgc0TMBzan+6THlgPvJdvGplPS+PRca4CVwPz0U9rm5mrg9Yg4FbgJWF3we7J6a2+Hc86pvH9p7cXVi1nNFZpkImJXRPww3X4D6AVmA8uAtanbWuDidHsZcFdE7I2IZ4A+4CxJs4C2iNgWEQHcPmhM6bnuAc4tVTk2xqxfn1Uvea73sm4dvPVWcTGZ2YhqtvCfprHeDzwEHB8RuyBLRJJKm0nNBn5QNqw/te
1Ltwe3l8a8kJ5rv6Q9wHHAK4W8Eau90WzHD1n1YmZ1VZOFf0lHA38H/OeI+OeRug7RFiO0jzRmcAwrJfVI6hkYGDhUyNYo5s/Pvx1/6VLIZlZ3hScZSUeQJZj1EVG6VPPLaQqM9Ht3au8HTiob3g68lNrbh2g/aIykCcAxwGuD44iIWyKiIyI6Zvg67I2vNDXW11f5mHHjsuTy2c8WF5eZ5VL00WUCbgV6I+KrZQ9tBFak2yuA+8val6cjxuaRLfA/nKbW3pC0OD3nlYPGlJ7rMuDBtG5jzWjr1ixZ5L0Ucmdn/uk0Mytc0WsyZwP/AXhc0iOp7U+BrwAbJF0NPA98HCAidkjaAOwkOzLt2ogo/cuxCrgNmAxsSj+QJbE7JPWRVTDLC35PVoSBgWxqLO91WxYuhB07ionJzA6bWvGP/o6OjujJszOvFevCC2HTpkP3G2znTliwoPrxmNmQJG2PiI48Y3zGv9VPb2+27pI3wZS243eCMWt43rvM6uP88+E738k35sMfhu9/v5h4zKwQTjJWWwMDMGtW/kX63bvBRwWaNR1Pl1ntXHopzJyZL8GUpsacYMyakisZK97AQJZc8rjmmuz6MGbW1JxkrFi///tw552V929ry38Ys5k1LE+XWTFKV6nMk2C6upxgzMYYVzJWfXmvUgnea8xsjHIlY9VT2m9sNAv7ZjYmuZKx6jjqqHxXnZw0ydd5MWsBrmTs8JQug5wnwfhCYmYtw5WMjd773w+PPHLofiXjx8P+/cXFY2YNx5WMjc6UKfkSTGenE4xZC3KSsXxuuCGbHvv5zyvrX7pK5apVxcZlZg3J02VWma1b4Zxz8o3xUWNmLc+VjB3aKafkSzBnnOEEY2aAKxkbyWj2HPOFxMysjCsZG1ppx+Q8fCExMxvEScYOVrpa5X33VT7mwgs9PWZmQ/J0mb1j0SJ49NHK+x95JDz/vK/1YmbDciVj75y1nyfBdHVlZ/k7wZjZCFzJtLqTT86qkUpNnlz5OTJm1vJcybSqUvWSJ8GsW+cEY2a5uJJpRe3t8OKLlff3jslmNkqFVjKSviFpt6Qfl7VNk/SApKfS72PLHrteUp+kJyVdUNZ+hqTH02M3S1JqnyTp7tT+kKS5Rb6fptfdnVUveRKMd0w2s8NQ9HTZbcCSQW3XAZsjYj6wOd1H0kJgOfDeNKZT0vg0Zg2wEpiffkrPeTXwekScCtwErC7snTS7qVPhoosq79/Wlh2WfPnlxcVkZmNeoUkmIr4PvDaoeRmwNt1eC1xc1n5XROyNiGeAPuAsSbOAtojYFhEB3D5oTOm57gHOLVU5lpSuVrlnT+Vjurry9TczG0Y91mSOj4hdABGxS1LptPLZwA/K+vWntn3p9uD20pgX0nPtl7QHOA54ZfCLSlpJVg0xZ86cqr2ZhjZzZrY1TKWmTYNXXy0uHjNrOY10dNlQFUiM0D7SmHc3RtwSER0R0TGjFc7taG/Pl2C6upxgzKzq6lHJvCxpVqpiZgG7U3s/cFJZv3bgpdTePkR7+Zh+SROAY3j39FzrmTKl8kONZ8+G/v5D9zMzG4V6VDIbgRXp9grg/rL25emIsXlkC/wPp6m1NyQtTustVw4aU3quy4AH07pN68pzQbEtW5xgzKxQRR/CfCewDfh1Sf2Srga+Apwv6Sng/HSfiNgBbAB2Av8AXBsRB9JTrQK+TnYwwE+ATan9VuA4SX3Ap0lHqrWkNWuyBFOJ00/Pjhw7++xiYzKzlqdW/MO/o6Mjenp66h1G9UyYAAcOHLof+HovZjZqkrZHREeeMY208G953XhjVr1UmmB8vRczqzFvK9Osxo3Ldw2XFqxYzaz+XMk0mxtuyKqXSpPGxIlOMGZWN04yzaK0sP8Xf1H5mKuugr17i4vJzOwQPF3WDPIs7Jfs3u0LiplZ3bmSaWR5F/YBVq/OpsecYMysAbiSaURbt8I55+QbM25c/mrHzKxgrmQazQkn5E8wnZ1OMGbWkFzJNI
ru7nzXe4HsyDEv7JtZA3Ml0wimT8+fYNatc4Ixs4bnJFNPpcsh59liv7Sw7ytWmlkT8HRZvUydmu/qk17YN7Mm5EqmHvJeDtkL+2bWpJxkaqm3t/Lt+AHOPTebGlu1qriYzMwK5OmyWnnf+2DHjsr7+4x9MxsDXMkUrbTnWKUJZto0n7FvZmOGK5kiHXEE7N9fef+uLli6tLh4zMxqzEmmKHnWXk48EV58sbhYzMzqxNNl1ZZ3cb+tzQnGzMYsJ5lqmjcPFi6svH9nZ75Dmc3MmoyTTDWUtuR/9tnK+p9wgg9NNrOW4DWZwzEwADNn5huzZQucfXYx8ZiZNRhXMqN14YX5EswHPpBVL04wZtZCnGTyKm1quWlT5WOmToXt24uLycysQY2JJCNpiaQnJfVJuq6wFzr55Pxb8nd2wuuvFxOPmVmDa/okI2k88DfAbwMLgd+TlOMQrwqdfDI8/3zl/Ves8OK+mbW8sbDwfxbQFxFPA0i6C1gG7KzaK3R3V55gpkyBN9+s2kubmTWzpq9kgNnAC2X3+1PbQSStlNQjqWdgYCDfK3zrW5X16+pygjEzKzMWksxQp9fHuxoibomIjojomJF388mLLx758VNOyabGvO+YmdlBxkKS6QdOKrvfDrxU1VdYuhROO23ox3buhJ/8pKovZ2Y2VoyFJPNPwHxJ8yRNBJYDG6v+Ko89lk2HnX46zJoFX/hCVr0sWFD1lzIzGyuafuE/IvZL+kPg/wDjgW9ERI6rg+WwdKmnxMzMcmj6JAMQEX8P/H294zAzs4ONhekyMzNrUE4yZmZWGCcZMzMrjJOMmZkVRhHvOm9xzJM0ADw3yuHTgVeqGE41Obb8GjUucGyj1aixNWpcUHlsJ0dErrPZWzLJHA5JPRHRUe84huLY8mvUuMCxjVajxtaocUGxsXm6zMzMCuMkY2ZmhXGSye+WegcwAseWX6PGBY5ttBo1tkaNCwqMzWsyZmZWGFcyZmZWGCcZMzMrTkS03A9wJPAw8CiwA/iz1P4XwGPAI8C3gRPLxlwP9AFPAheUtZ8BPJ4eu5l3piAnAXen9oeAuUXEBswFfpHaHwH+exGxDRdX2eN/THaxuOmN8pkNF1utPrND/Pf8EvBiWQwXNsrnNlxsjfBdA/4ofS47gBsb5TMbLrYG+a7dXfb6zwKP1PJzq9s/9PX8Ibua5tHp9hHpw1oMtJX1+WTpCwEsTP/hJgHzgJ8A49NjDwMfTM+5Cfjt1P6JsvHLgbsLim0u8ONhnqtqsQ0XV7p/EtmlFp7jnX/I6/6ZjRBbTT6zQ/z3/BLwx0P0r/vnNkJsdf2uAR8BvgNMSo/NbKDPbLjY6v5dG9TnL4Ebavm5teR0WWTeTHePSD8REf9c1m0K71zGeRlwV0TsjYhnyLL4WZJmkf3jvy2yT/124OKyMWvT7XuAcyUNdanow41tSNWObbi40v2bgM8Oiqnun9kIsQ2pDrENpVE+t4rV8Lu2CvhKROxN/XaXvUa9P7PhYhtSPf57puf498CdZa9T+OfWkkkGQNJ4SY8Au4EHIuKh1P5lSS8AlwM3pO6zgRfKhventtnp9uD2g8ZExH5gD3BcAbEBzJP0I0nfk/ThstevamxDxSXpY8CLEfHooO51/8xGiA1q9JkNF1t66A8lPSbpG5KOHfw6g2JohNigjt814D3AhyU9lF7/zMGvMej1a/mZDRcbNMZ3DeDDwMsR8dTg1xkUQ1Vja9kkExEHImIR0E6Wvd+X2j8fEScB64E/TN2HytQxQvtIY6od2y5gTkS8H/g08E1JbUXENkRcpwOf5+CEV1Lvz2yk2Gr2mQ0T2/uANcC/AhaleP7yEK/TCNjteFkAAALRSURBVLHV87v2PrKLLB5LNj31J8CG9Fd0I3xmw8XWCN+1kt/jnSpmpNepamwtm2RKIuJnwHeBJYMe+ibwO+l2P9ncfkk78FJqbx+i/aAxkiYAxwCvVTu2VOq+mm5vJ5tXfU+RsZXFtY
xsLvdRSc+m1/ihpBOo/2c2bGz1+MwGxbYkIl5O/yC8DfxP4KzBrzMohrrHVufv2pL0XPemaaGHgbfJNnas+2c2XGyN8F0re55LyRbtS2ryubVkkpE0Q9LUdHsycB7whKT5Zd0+BjyRbm8ElkuaJGkeMB94OCJ2AW9IWpz+arkSuL9szIp0+zLgwTS/WdXYUv/x6fYpKbanqx3bMHH9KCJmRsTciJhL9gX8QET8tAE+s2Fjq9VnNkJsTyib9y65BPhx2evU+7s2ZGx1/q49AXwL+Ghqfw8wkWzn4Lp/ZsPF1gjftfTwecATEVE+DVabzy0qOGphrP0ApwM/Ijsk+Me8c7TF36X7jwFdwOyyMZ8n+yvkSdKRFqm9I435CfDXvHOo35HA35Itpj0MnFJEbGQVzQ6yo0R+CFxURGzDxTWoz7McfAhzXT+z4WKr1Wd2iP+ed5AdIvoY2f+4sxrlcxsutnp/18j+4V6X2n4IfLSBPrMhY2uE71p67DbgmiHGFP65eVsZMzMrTEtOl5mZWW04yZiZWWGcZMzMrDBOMmZmVhgnGTMzK4yTjFmdSfq/w7TfJumyWsdjVk1OMmZ1FhEfqncMZkWZUO8AzFqdpDcj4uh0dvXXyM4cf4ah94kyayquZMwaxyXArwOnAf8RcIVjTc9Jxqxx/BZwZ2SbU74EPFjvgMwOl5OMWWPxPk82pjjJmDWO75Ptijs+7YT8kXoHZHa4vPBv1jjuI1v0fxz4f8D36huO2eHzLsxmZlYYT5eZmVlhnGTMzKwwTjJmZlYYJxkzMyuMk4yZmRXGScbMzArjJGNmZoX5/4RbUU+9atxUAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bot_actions.plot(kind='scatter',x='id',y='m_ticks',color='red')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 687, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "player_actions.plot(kind='scatter',x='id',y='m_ticks',color='blue')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Correlation between action values in bots and humans" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that `m_ticks` are random for humans and consistent for bots.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Correlation between action values in bots and humans" ] }, { "cell_type": "code", "execution_count": 688, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bot_actions.plot(kind='scatter',x='id',y='m_value',color='red')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 689, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "player_actions.plot(kind='scatter',x='id',y='m_value',color='blue')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alright, enough with the graphs, lets build a data structure for a model training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build few models - full data, m_ticks, and m_value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Full data - (ticks,action,value,value_l,value_r) - prepare the data " ] }, { "cell_type": "code", "execution_count": 690, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['sessionid', 'id', 'm_ticks', 'm_kart_id', 'm_action', 'm_value',\n", " 'm_value_l', 'm_value_r', 'class'],\n", " dtype='object')\n", "sessionid 42790\n", "id 42790\n", "m_ticks 42790\n", "m_kart_id 42790\n", "m_action 42790\n", "m_value 42790\n", "m_value_l 42790\n", "m_value_r 42790\n", "class 42790\n", "dtype: int64\n" ] } ], "source": [ "actions = pd.DataFrame(action_list)\n", "actions.rename(columns={0:'sessionid',\n", " 1:'id',\n", " 2:'m_ticks',\n", " 3:'m_kart_id',\n", " 4:'m_action',\n", " 5:'m_value',\n", " 6:'m_value_l',\n", " 7:'m_value_r',\n", " 8:'class'\n", " }, \n", " inplace=True)\n", "print(actions.columns)\n", "print(actions.count())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We used `id` for sorting the session actions but now we dont need it anymore." ] }, { "cell_type": "code", "execution_count": 704, "metadata": {}, "outputs": [], "source": [ "full_actions=actions.drop('id',axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Checking the amount of data we lose in aggreagtion. 
The session length is 25 so we are going loose all the session that are smaller than 25" ] }, { "cell_type": "code", "execution_count": 705, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sessionid\n", "0 26\n", "1 29\n", "2 25\n", "3 28\n", "4 27\n", " ... \n", "575 100\n", "576 100\n", "577 100\n", "578 100\n", "579 14\n", "Length: 450, dtype: int64\n", "42601\n" ] }, { "data": { "text/plain": [ "sessionid 42790\n", "m_ticks 42790\n", "m_kart_id 42790\n", "m_action 42790\n", "m_value 42790\n", "m_value_l 42790\n", "m_value_r 42790\n", "class 42790\n", "dtype: int64" ] }, "execution_count": 705, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(full_actions.groupby(['sessionid'],axis=0).size())\n", "sum=0\n", "for i in full_actions.groupby(['sessionid'],axis=0).size():\n", " #print(i,end='####')\n", " if(i>25):\n", " sum=sum+i\n", "print(sum)\n", "full_actions.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Checking the size of sessions. Session is an ordered set of actions grouped by the sessiondid, the player kart id and the classification. \n", "\n", "Classification is set as 1 in case of human, and 0 as bot.\n", "\n", "The `sum` is the number of remaining data after aggregating on both the kartid and class." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Construct a dictonary based on `sessionid` and action classification `class`" ] }, { "cell_type": "code", "execution_count": 706, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "42790\n" ] } ], "source": [ "player_session_dict = {}\n", "i=0\n", "\n", "for index, row in full_actions.iterrows():\n", " i=i+1\n", " play_action_val=row['m_ticks'],row['m_action'],row['m_value'],row['m_value_l'],row['m_value_r']\n", " play_action_key=row['sessionid'],row['m_kart_id'],row['class']\n", " if(player_session_dict.get(play_action_key)):\n", " session_list=player_session_dict[play_action_key]\n", " session_list.append(play_action_val)\n", " player_session_dict[play_action_key]=session_list \n", " else:\n", " session_list=[]\n", " session_list.append(play_action_val)\n", " player_session_dict[play_action_key]=session_list\n", "print(i)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the session length (number of actions in a session)" ] }, { "cell_type": "code", "execution_count": 694, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "number of actions pulled from aurora=42790\n", "number of actions after initial aggregations=41513\n" ] }, { "data": { "text/plain": [ "939" ] }, "execution_count": 694, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum=0\n", "for key in player_session_dict.keys():\n", " if (len(player_session_dict[key])>=25):\n", " #print(len(player_session_dict[key]),end = '###')\n", " sum=sum+len(player_session_dict[key])\n", "print()\n", "print('number of actions pulled from aurora={}'.format(i))\n", "print('number of actions after initial aggregations={}'.format(sum))\n", "len(player_session_dict)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transpose the dataframe to a new column strucutre that concatenates series of consecutive actions in a fixed size of 25" ] }, { 
"cell_type": "code", "execution_count": 695, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1306\n" ] } ], "source": [ "import numpy as np\n", "session_len=25\n", "#a line in the array is five set of actions plus the classification \n", "size_of_arr_line=session_len*5+1\n", "sessions_array = np.zeros((1306,size_of_arr_line), dtype=float, order='C')\n", "\n", "i=0\n", "sum_of_actions=0\n", "for key in player_session_dict:\n", " raw_session=player_session_dict[key]\n", " #break the session into equal parts 25 actions long\n", " sessions_list=[raw_session[i:i + session_len] for i in range(0, len(raw_session), session_len)] \n", " #every session has 25 action set\n", " for session in sessions_list:\n", " #including only the full sessions with excatly 25 actions\n", " if(len(session)==session_len):\n", " new_row=[]\n", " for action in session:\n", " new_row.append(action[0])\n", " new_row.append(action[1])\n", " new_row.append(action[2])\n", " new_row.append(action[3])\n", " new_row.append(action[4])\n", " #add class field to the last column \n", " new_row.append(key[2])\n", " sessions_array[i]=np.asarray(new_row)\n", " i=i+1 \n", "print(i)" ] }, { "cell_type": "code", "execution_count": 696, "metadata": {}, "outputs": [], "source": [ "df=pd.DataFrame(sessions_array) " ] }, { "cell_type": "code", "execution_count": 697, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...116117118119120121122123124125
09114.01.03159.02218.0-3296.09145.00.032768.02218.0-3159.0...0.00.032768.00.01385.02.032768.00.00.00.0
11443.01.032768.00.00.01521.01.00.00.0-32768.0...0.00.032768.00.03047.02.00.00.00.00.0
23967.00.032768.00.00.04082.02.032768.032768.00.0...0.032768.00.00.05361.00.00.032768.00.00.0
35420.01.032768.00.00.05480.01.00.00.0-32768.0...1.00.00.0-32768.07166.00.032768.00.00.00.0
47963.01.032768.032768.00.07964.00.00.032768.0-32768.0...1.032768.00.00.09307.01.00.00.0-32768.00.0
\n", "

5 rows × 126 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 \\\n", "0 9114.0 1.0 3159.0 2218.0 -3296.0 9145.0 0.0 32768.0 2218.0 \n", "1 1443.0 1.0 32768.0 0.0 0.0 1521.0 1.0 0.0 0.0 \n", "2 3967.0 0.0 32768.0 0.0 0.0 4082.0 2.0 32768.0 32768.0 \n", "3 5420.0 1.0 32768.0 0.0 0.0 5480.0 1.0 0.0 0.0 \n", "4 7963.0 1.0 32768.0 32768.0 0.0 7964.0 0.0 0.0 32768.0 \n", "\n", " 9 ... 116 117 118 119 120 121 122 \\\n", "0 -3159.0 ... 0.0 0.0 32768.0 0.0 1385.0 2.0 32768.0 \n", "1 -32768.0 ... 0.0 0.0 32768.0 0.0 3047.0 2.0 0.0 \n", "2 0.0 ... 0.0 32768.0 0.0 0.0 5361.0 0.0 0.0 \n", "3 -32768.0 ... 1.0 0.0 0.0 -32768.0 7166.0 0.0 32768.0 \n", "4 -32768.0 ... 1.0 32768.0 0.0 0.0 9307.0 1.0 0.0 \n", "\n", " 123 124 125 \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 32768.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 -32768.0 0.0 \n", "\n", "[5 rows x 126 columns]" ] }, "execution_count": 697, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 698, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "125\n", "0.0 1186\n", "1.0 120\n", "dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...116117118119120121122123124125
09114.01.03159.02218.0-3296.09145.00.032768.02218.0-3159.0...0.00.032768.00.01385.02.032768.00.00.00.0
\n", "

1 rows × 126 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 \\\n", "0 9114.0 1.0 3159.0 2218.0 -3296.0 9145.0 0.0 32768.0 2218.0 -3159.0 \n", "\n", " ... 116 117 118 119 120 121 122 123 124 125 \n", "0 ... 0.0 0.0 32768.0 0.0 1385.0 2.0 32768.0 0.0 0.0 0.0 \n", "\n", "[1 rows x 126 columns]" ] }, "execution_count": 698, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(df.groupby(125,axis=0).size())\n", "df.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To check if we did not loose data we need to multiple the number by 25" ] }, { "cell_type": "code", "execution_count": 699, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "class\n", "0 38973\n", "1 3817\n", "dtype: int64" ] }, "execution_count": 699, "metadata": {}, "output_type": "execute_result" } ], "source": [ "min_actions.groupby(['class'],axis=0).size()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check how much data was lost during transformation." ] }, { "cell_type": "code", "execution_count": 700, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.03143830233167409\n", "0.03237606464293514\n" ] } ], "source": [ "print(120/3817)\n", "print(1186/36632)" ] }, { "cell_type": "code", "execution_count": 701, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RangeIndex(start=0, stop=126, step=1)\n", "feature_columns=RangeIndex(start=0, stop=125, step=1)\n", "label_column=125\n" ] } ], "source": [ "print(df.columns)\n", "feature_columns = df.columns[:-1]\n", "label_column = df.columns[-1]\n", "\n", "print('feature_columns={}'.format(feature_columns))\n", "print('label_column={}'.format(label_column))\n", "\n", "features = df[feature_columns].values.astype('float32')\n", "labels = (df[label_column].values).astype('float32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prepare Data and Upload to S3" ] }, { "cell_type": "code", "execution_count": 658, "metadata": {}, 
"outputs": [], "source": [ "import io\n", "import sagemaker.amazon.common as smac\n", "\n", "buf = io.BytesIO()\n", "smac.write_numpy_to_dense_tensor(buf, features, labels)\n", "buf.seek(0);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we upload the data to S3 using boto3." ] }, { "cell_type": "code", "execution_count": 659, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uploaded training data location: s3://stk-events/anticheat-model/train/recordio-pb-data\n", "Training artifacts will be uploaded to: s3://stk-events/anticheat-model/output\n" ] } ], "source": [ "import boto3\n", "import os\n", "import sagemaker\n", "\n", "session = sagemaker.Session()\n", "bucket = 'stk-events'\n", "\n", "prefix = 'anticheat-model'\n", "key = 'recordio-pb-data'\n", "\n", "boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)\n", "\n", "s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)\n", "print('Uploaded training data location: {}'.format(s3_train_data))\n", "\n", "output_location = 's3://{}/{}/output'.format(bucket, prefix)\n", "print('Training artifacts will be uploaded to: {}'.format(output_location))" ] }, { "cell_type": "code", "execution_count": 660, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.\n", "Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.\n", "'s3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.\n" ] } ], "source": [ "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "\n", "container = get_image_uri(boto3.Session().region_name, 'linear-learner')\n", "\n", "from sagemaker import get_execution_role\n", "\n", "linear = sagemaker.estimator.Estimator(container,\n", " get_execution_role(), \n", " train_instance_count=1, \n", " 
train_instance_type='ml.c4.xlarge',\n", " output_path=output_location,\n", " sagemaker_session=session)\n", "linear.set_hyperparameters(feature_dim=features.shape[1],\n", " predictor_type='binary_classifier',\n", " mini_batch_size=200)\n", "\n", "linear.fit({'train': s3_train_data},wait=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Host Linear Classifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we deploy the estimator to and endpoint.\n" ] }, { "cell_type": "code", "execution_count": 662, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Parameter image will be renamed to image_uri in SageMaker Python SDK v2.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-------------------!" ] } ], "source": [ "from sagemaker.predictor import csv_serializer, json_deserializer\n", "\n", "linear_predictor = linear.deploy(initial_instance_count=1,\n", " endpoint_name=\"stk-bot-detector\",\n", " instance_type='ml.m4.xlarge')\n", "# Specify input and output formats.\n", "linear_predictor.content_type = 'text/csv'\n", "linear_predictor.serializer = csv_serializer\n", "linear_predictor.deserializer = json_deserializer" ] }, { "cell_type": "code", "execution_count": 663, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Endpoint name: stk-bot-detector\n" ] } ], "source": [ "print('Endpoint name: {}'.format(linear_predictor.endpoint))" ] }, { "cell_type": "code", "execution_count": 664, "metadata": {}, "outputs": [], "source": [ "from sagemaker.predictor import csv_serializer, json_deserializer\n", "\n", "linear_predictor.content_type = 'text/csv'\n", "linear_predictor.serializer = csv_serializer\n", "linear_predictor.accept = 'application/json'\n", "linear_predictor.deserializer = json_deserializer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calling the endpoint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"prepare the parameters for the model endpoint call.\n", "\n", "This SQL will return a game session actions (25 actions) that we trained the model with" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```sql\n", "select concat(id,\"-\",m_kart_id) id, m_ticks,m_action,m_value,m_value_l,m_value_r \n", "from (\n", " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r \n", " from actions order by m_kart_id,id limit 25) t;\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is an example result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "mysql> select * from tmp\n", " -> ;\n", "+-------+---------+----------+---------+-----------+-----------+\n", "| id | ticks | action | value | value_l | value_r |\n", "+-------+---------+----------+---------+-----------+-----------+\n", "| 0 | 31 | 1 | 4479 | 0 | 0 |\n", "| 1 | 32 | 2 | 32768 | 0 | -4479 |\n", "| 2 | 62 | 1 | 5423 | 0 | -4479 |\n", "| 3 | 93 | 1 | 6317 | 0 | -5423 |\n", "| 4 | 124 | 1 | 5064 | 0 | -6317 |\n", "| 5 | 155 | 1 | 5729 | 0 | -5064 |\n", "| 6 | 186 | 1 | 6469 | 0 | -5729 |\n", "| 7 | 217 | 1 | 6714 | 0 | -6469 |\n", "| 8 | 248 | 0 | 32768 | 0 | -6714 |\n", "| 9. | 279 | 1 | 7953 | 32768 | -6714 |\n", "| 10 | 310 | 1 | 14415 | 32768 | -7953 |\n", "| 11 | 341 | 0 | 3698 | 32768 | -14415 |\n", "| 12 | 372 | 0 | 5492 | 3698 | -14415 |\n", "| 13 | 403 | 0 | 20817 | 5492 | -14415 |\n", "| 14 | 434 | 0 | 26290 | 20817 | -14415 |\n", "| 15. | 465 | 0 | 24373 | 26290 | -14415 |\n", "| 16. | 496 | 0 | 11402 | 24373 | -14415 |\n", "| 17. | 527 | 0 | 10370 | 11402 | -14415 |\n", "| 18. | 558 | 0 | 12071 | 10370 | -14415 |\n", "| 19. | 589 | 0 | 15223 | 12071 | -14415 |\n", "| 20. | 620 | 0 | 17070 | 15223 | -14415 |\n", "| 21. | 651 | 0 | 19485 | 17070 | -14415 |\n", "| 22. | 682 | 0 | 16294 | 19485 | -14415 |\n", "| 23. | 713 | 0 | 12267 | 16294 | -14415 |\n", "| 24. 
| 744 | 0 | 10856 | 12267 | -14415 |\n", "+-------+---------+----------+---------+-----------+-----------+\n", "```" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "We need to concatenate the 25 results into a single line to send it to the model endpoint." ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "+---------+----------+---------+-----------+-----------+ +---------+---------+--------+----------+----------+\n", "| ticks_0 | action_0 | value_0 | value_l_0 | value_r_0 |..... |ticks_24 |action_24|value_24|value_l_24|value_r_24|\n", "+---------+----------+---------+-----------+-----------+ +---------+---------+--------+----------+----------+\n", " 31 1 4479 0 0 744 0 10856 12267 -14415\n", "```" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "```sql\n", "select group_concat(concat(m_ticks,',',m_action,',',m_value,',',m_value_l,',',m_value_r)) as col \n", "from (select concat(id,\"-\",m_kart_id) id, m_ticks,m_action,m_value,m_value_l,m_value_r \n", " from ( \n", " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r \n", " from actions order by m_kart_id,id limit 25) \n", " t) \n", "t1;\n", "```" ] },
{ "cell_type": "code", "execution_count": 665, "metadata": {},
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 19.4 ms, sys: 0 ns, total: 19.4 ms\n", "Wall time: 98.7 ms\n" ] } ],
"source": [
 "%%time\n",
 "import pandas as pd\n",
 "\n",
 "# Build one inference sample from the most recent actions: 25 actions x 5 fields,\n",
 "# comma-joined by MySQL into a single CSV line (stored in sample_list).\n",
 "# NOTE(review): group_concat has no ORDER BY, so the order of actions inside the\n",
 "# sample is whatever MySQL returns -- confirm it matches the training feature order.\n",
 "conn = None\n",
 "try:\n",
 "    conn = mysql.connector.connect(host=ENDPOINT, user=USR, passwd=PASWD, port=3306, database=DBNAME)\n",
 "    cur = conn.cursor()\n",
 "    cur.execute(\"\"\"\n",
 " select group_concat(concat(m_ticks,',',m_action,',',m_value,',',m_value_l,',',m_value_r)) as sample\n",
 " from (\n",
 " select concat(id,\"-\",m_kart_id) id, m_ticks,m_action,m_value,m_value_l,m_value_r\n",
 " from (\n",
 " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r\n",
 " from (\n",
 " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r\n",
 " from actions order by id desc limit 70) t1 order by m_kart_id limit 25) t2) t3; \n",
 " \"\"\")\n",
 "    sample_list = cur.fetchall()\n",
 "except Exception as e:\n",
 "    print(\"Database connection failed due to {}\".format(e))\n",
 "    # Re-raise: continuing would leave sample_list undefined (or stale from an\n",
 "    # earlier run) and the prediction cells below would score the wrong data.\n",
 "    raise\n",
 "finally:\n",
 "    # Always release the database connection, even when the query fails.\n",
 "    if conn is not None:\n",
 "        conn.close()"
] },
{ "cell_type": "code", "execution_count": 666, "metadata": {}, "outputs": [], "source": [ "sample=''.join(sample_list[0])" ] },
{ "cell_type": "code", "execution_count": 667, "metadata": {},
"outputs": [ { "data": { "text/plain": [ "'80447,0,0,32768,0,78329,1,0,0,-32768,76364,1,32768,0,0,78647,1,0,0,-32768,76891,0,0,32768,0,79382,1,32768,0,0,77409,4,0,32768,0,79825,1,0,0,-32768,77971,1,32768,0,0,80669,3,0,0,0,78484,1,32768,0,0,76713,1,32768,0,0,79121,1,0,0,-32768,77126,0,32768,0,0,79600,1,32768,0,0,77585,0,32768,0,0,80430,0,32768,0,0,78299,1,32768,0,0,76239,1,0,0,-32768,78622,1,32768,0,0,76875,0,32768,0,0,79270,1,0,0,-32768,77261,0,32768,0,0,79774,1,32768,0,0,77882,1,0,0,-32768'" ] }, "execution_count": 667, "metadata": {}, "output_type": "execute_result" } ],
"source": [ "sample" ] },
{ "cell_type": "code", "execution_count": 668, "metadata": {},
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'predictions': [{'score': 0.0001962552487384528, 'predicted_label': 0}]}\n" ] } ],
"source": [ "results = linear_predictor.predict(sample)\n", "print(results)" ] },
{ "cell_type": "code", "execution_count": 671, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 19.1 ms, sys: 0 ns, total: 19.1 ms\n", "Wall time: 63.7 ms\n" ] }, { "data": { "text/plain": [ 
"'62,1,846,0,-672,589,0,32768,32768,-32768,899,0,30620,19736,-1848,279,0,765,0,-102,651,7,32768,32768,-5391,31,1,672,0,0,837,0,3445,11325,-1848,217,1,69,0,-287,1085,0,32768,18005,-3301,434,0,32768,32768,-20774,775,0,11325,11227,-9552,155,1,456,0,-282,31,2,32768,0,-672,527,1,32768,32768,-29192,1023,1,3301,16818,-1848,372,1,9534,32768,-102,713,1,9552,32768,-19031,93,1,209,0,-846,930,0,32768,30620,-1848,310,0,508,765,-102,868,0,19736,3445,-1848,248,1,102,0,-69,806,1,1848,11325,-9552,186,1,287,0,-456,682,1,19031,32768,-5391'" ] }, "execution_count": 671, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "import pandas as pd\n", "\n", "modulo_size=224\n", "\n", "try:\n", " conn = mysql.connector.connect(host=ENDPOINT, user=USR, passwd=PASWD, port=3306, database=DBNAME)\n", " cur = conn.cursor()\n", " cur.execute(\"\"\"\n", " select group_concat(concat(m_ticks,',',m_action,',',m_value,',',m_value_l,',',m_value_r)) as sample\n", " from (\n", " select concat(id,\"-\",m_kart_id) id, m_ticks,m_action,m_value,m_value_l,m_value_r\n", " from (\n", " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r\n", " from (\n", " select id,m_kart_id,m_ticks,m_action,m_value,m_value_l,m_value_r\n", " from actions where class=1 order by id limit 70) t1 order by m_kart_id limit 25) t2) t3; \n", " \"\"\")\n", " sample_list=cur.fetchall() \n", "except Exception as e:\n", " print(\"Database connection failed due to {}\".format(e))\n", "sample=''.join(sample_list[0])\n", "sample" ] }, { "cell_type": "code", "execution_count": 672, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'predictions': [{'score': 0.15919774770736694, 'predicted_label': 0}]}\n" ] } ], "source": [ "results = linear_predictor.predict(sample)\n", "print(results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean up\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will leave the prediction endpoint running at 
the end of this notebook so we can handle incoming event streams. However, don't forget to delete the prediction endpoint when you're done. You can do that at the Amazon SageMaker console in the Endpoints page. Or you can run linear_predictor.delete_endpoint()" ] } ], "metadata": { "kernelspec": { "display_name": "conda_pytorch_latest_p36", "language": "python", "name": "conda_pytorch_latest_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }