{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# BikeNow XGBoost Regression\n", "\n", "Import libraries." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://s3-us-east-1.amazonaws.com/bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf\n", "CPU times: user 595 ms, sys: 43.4 ms, total: 639 ms\n", "Wall time: 658 ms\n" ] } ], "source": [ "%%time\n", "\n", "import os\n", "import boto3\n", "import re\n", "from sagemaker import get_execution_role\n", "\n", "role = get_execution_role()\n", "region = boto3.Session().region_name\n", "\n", "bucket='bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf' # put your s3 bucket name here, and create s3 bucket\n", "src_file = 'unload/station_status_history_000'\n", "prefix = 'sagemaker/bikenow-xgboost-regression'\n", "\n", "# customize to your bucket where you have stored the data\n", "bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)\n", "\n", "print(bucket_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Helper functions to split data into training, validation, and testing sets." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8 µs, sys: 0 ns, total: 8 µs\n", "Wall time: 294 µs\n" ] } ], "source": [ "%%time\n", "\n", "import io\n", "import boto3\n", "import random\n", "\n", "def data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST, PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST):\n", " data = [l for l in open(FILE_DATA, 'r')]\n", " train_file = open(FILE_TRAIN, 'w')\n", " valid_file = open(FILE_VALIDATION, 'w')\n", " tests_file = open(FILE_TEST, 'w')\n", "\n", " num_of_data = len(data)\n", " num_train = int((PERCENT_TRAIN/100.0)*num_of_data)\n", " num_valid = int((PERCENT_VALIDATION/100.0)*num_of_data)\n", " num_tests = int((PERCENT_TEST/100.0)*num_of_data)\n", "\n", " data_fractions = [num_train, num_valid, num_tests]\n", " split_data = [[],[],[]]\n", "\n", " rand_data_ind = 0\n", "\n", " for split_ind, fraction in enumerate(data_fractions):\n", " for i in range(fraction):\n", " rand_data_ind = random.randint(0, len(data)-1)\n", " split_data[split_ind].append(data[rand_data_ind])\n", " data.pop(rand_data_ind)\n", "\n", " for l in split_data[0]:\n", " train_file.write(l)\n", "\n", " for l in split_data[1]:\n", " valid_file.write(l)\n", "\n", " for l in split_data[2]:\n", " tests_file.write(l)\n", "\n", " train_file.close()\n", " valid_file.close()\n", " tests_file.close()\n", "\n", "def write_to_s3(fobj, bucket, key):\n", " return boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(key).upload_fileobj(fobj)\n", "\n", "def upload_to_s3(bucket, channel, filename):\n", " fobj=open(filename, 'rb')\n", " key = prefix+'/'+channel\n", " url = 's3://{}/{}/{}'.format(bucket, key, filename)\n", " print('Writing to {}'.format(url))\n", " write_to_s3(fobj, bucket, key)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Download data and split files." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing to s3://bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf/sagemaker/bikenow-xgboost-regression/train/bikenow.train\n", "Writing to s3://bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf/sagemaker/bikenow-xgboost-regression/validation/bikenow.validation\n", "Writing to s3://bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf/sagemaker/bikenow-xgboost-regression/test/bikenow.test\n", "CPU times: user 4.52 s, sys: 20.5 ms, total: 4.54 s\n", "Wall time: 5.08 s\n" ] } ], "source": [ "%%time\n", "import urllib.request\n", "\n", "# Load the dataset\n", "FILE_DATA = 'bikenow'\n", "boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(src_file).download_file(FILE_DATA)\n", "#urllib.request.urlretrieve(\"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/abalone\", FILE_DATA)\n", "\n", "#split the downloaded data into train/test/validation files\n", "FILE_TRAIN = 'bikenow.train'\n", "FILE_VALIDATION = 'bikenow.validation'\n", "FILE_TEST = 'bikenow.test'\n", "PERCENT_TRAIN = 70\n", "PERCENT_VALIDATION = 15\n", "PERCENT_TEST = 15\n", "data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST, PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST)\n", "\n", "#upload the files to the S3 bucket\n", "upload_to_s3(bucket, 'train', FILE_TRAIN)\n", "upload_to_s3(bucket, 'validation', FILE_VALIDATION)\n", "upload_to_s3(bucket, 'test', FILE_TEST)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get XGBoost container image." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 27 µs, sys: 1 µs, total: 28 µs\n", "Wall time: 30.3 µs\n" ] } ], "source": [ "%%time\n", "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "container = get_image_uri(region, 'xgboost', '0.90-1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create training job." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training job bikenow-xgboost-regression-2019-12-19-02-00-00\n", "InProgress\n", "InProgress\n", "InProgress\n", "InProgress\n", "Completed\n", "CPU times: user 70.7 ms, sys: 546 µs, total: 71.3 ms\n", "Wall time: 4min\n" ] } ], "source": [ "%%time\n", "import boto3\n", "from time import gmtime, strftime\n", "\n", "job_name = 'bikenow-xgboost-regression-' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", "print(\"Training job\", job_name)\n", "\n", "#Ensure that the training and validation data folders generated above are reflected in the \"InputDataConfig\" parameter below.\n", "\n", "create_training_params = \\\n", "{\n", " \"AlgorithmSpecification\": {\n", " \"TrainingImage\": container,\n", " \"TrainingInputMode\": \"File\"\n", " },\n", " \"RoleArn\": role,\n", " \"OutputDataConfig\": {\n", " \"S3OutputPath\": bucket_path + \"/\" + prefix + \"/single-xgboost\"\n", " },\n", " \"ResourceConfig\": {\n", " \"InstanceCount\": 1,\n", " \"InstanceType\": \"ml.m4.4xlarge\",\n", " \"VolumeSizeInGB\": 5\n", " },\n", " \"TrainingJobName\": job_name,\n", " \"HyperParameters\": {\n", " \"max_depth\":\"5\",\n", " \"eta\":\"0.2\",\n", " \"gamma\":\"4\",\n", " \"min_child_weight\":\"6\",\n", " \"subsample\":\"0.7\",\n", " \"silent\":\"0\",\n", " \"objective\":\"reg:linear\",\n", " \"num_round\":\"50\"\n", " },\n", " \"StoppingCondition\": {\n", " \"MaxRuntimeInSeconds\": 3600\n", " },\n", " \"InputDataConfig\": [\n", " {\n", " \"ChannelName\": \"train\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": bucket_path + \"/\" + prefix + '/train',\n", " \"S3DataDistributionType\": \"FullyReplicated\"\n", " }\n", " },\n", " \"ContentType\": \"csv\",\n", " \"CompressionType\": \"None\"\n", " },\n", " {\n", " \"ChannelName\": \"validation\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": bucket_path + \"/\" + prefix + '/validation',\n", " \"S3DataDistributionType\": \"FullyReplicated\"\n", " }\n", " },\n", " \"ContentType\": \"csv\",\n", " \"CompressionType\": \"None\"\n", " }\n", " ]\n", "}\n", "\n", "\n", "client = boto3.client('sagemaker', region_name=region)\n", "client.create_training_job(**create_training_params)\n", "\n", "import time\n", "\n", "status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n", "print(status)\n", "while status !='Completed' and status!='Failed':\n", " time.sleep(60)\n", " status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n", " print(status)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/plotting/_core.py:1001: UserWarning: Attempting to set identical left==right results\n", "in singular transformations; automatically expanding.\n", "left=0.0, right=0.0\n", " ax.set_xlim(left, right)\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAusAAAFACAYAAAAFyjKNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHEhJREFUeJzt3XuUZWdZJ+DfS5qAAiGEtAwQsIMyGQLDtUWDqOHiECLIZWQkwyBgJCqKsEacCcNaMKNLcWBGBbxgjDFhwACiIEoAY6SNQgA7kpAOISbctCHQDZEAogSSd/44u6W6rMvpWLtqV/XzrHXW2ec7+/LW17vO+fWub+9d3R0AAGB6brXRBQAAAEsT1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJ2rbRBaylY489tnfs2LHRZQAAsIVdeumln+3u7euxrS0V1nfs2JHdu3dvdBkAAGxhVfWJ9dqWYTAAADBRwjoAAEyUsA4AABMlrAMAwEQJ6wAAMFHCOgAATJSwDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBECesAADBRwjoAAEyUsA4AABMlrAMAwEQJ6wAAMFHCOgAATJSwDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBECesAADBRwjoAAEyUsA4AABMlrAMAwEQJ6wAAMFHCOgAATNRoYb2qzqmqfVW1Z0HbU6rqyqq6uap2rrDs0VX1pqr6cFVdVVUnjVUnAABM1ZhH1s9Ncsqitj1Jnpzk4lWWfUWSd3T3v0vygCRXrXl1AAAwcdvGWnF3X1xVOxa1XZUkVbXsclV1xyTfneSZwzI3JrlxpDIBAGCypjhm/fgk+5P8TlV9oKrOrqrbLTdzVZ1RVburavf+/fvXr0oAABjZFMP6tiQPTvIb3f2gJP+Q5MzlZu7us7p7Z3fv3L59+3rVCAAAo5tiWN+bZG93v294/abMwjsAABxWJhfWu/vTSf6uqk4Ymh6V5EMbWBIAAGyIMS/deH6SS5KcUFV7q+r0qnpSVe1NclKSt1XVO4d571ZVFyxY/LlJXldVH0zywCS/MFadAAAwVWNeDea0Zd568xLzfirJqQteX5Zk2euwAwDA4WByw2AAAIAZYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJGi2sV9U5VbWvqvYsaHtKVV1ZVTdX1c5Vlj+iqj5QVX88Vo0AADBlYx5ZPzfJKYva9iR5cpKL51j+eUmuWuOaAABg0xgtrHf3xUmuX9R2VXdfvdqyVXVcku9LcvZI5QEAwORNdcz6ryT5b0lu3uhCAABgo0wurFfV45Ls6+5L55z/jKraXVW79+/fP3J1AACwfiYX1pN8Z5Lvr6qPJ3l9kkdW1WuXm7m7z+rund29c/v27etVIwAAjG5yYb27X9jdx3X3jiRPTfJn3f1fNrgsAABYd2NeuvH8JJckOaGq9lbV6VX1pKram+SkJG+rqncO896tqi4YqxYAANiMto214u4+bZm33rzEvJ9KcuoS7buS7FrTwgAAYJOY3DAYAABgRlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJmiusV9U3VNUJYxcDAAB83aphvaoen+SyJO8YXj+wqt46dmEAAHC4m+fI+v9M8tAkn0+S7r4syfEj1gQAAGS+sP7V7r5hUVuPUQwAAPB12+aY58qq+s9Jjqiqeyf5qSTvGbcsAABgniPrz01y3yRfSXJ+ki8kef6YRQEAAHMcWe/uLyd5UZIXVdURSW7X3f80emUAAHCYm+dqML9bVUdV1e2SXJHkQ1X1M+OXBgAAh7d5hsGc2N1fSPLEJG/P7EowTx+1KgAAYK6wfuuqunVmYf2t3f3VuBoMAACMbp6w/ptJPp7kdkkurqpvzuwkUwAAYETznGD6yiSvXND0iap6xHglAQAAyRxhvaqOTvJDSXYsmv+nRqoJAADIfDdFuiDJezO7EszN45YDAAAcME9Yv213/9fRKwEAAA4yzwmm/6+qnl1Vd62qYw48Rq8MAAAOc/McWb8xycszu4vpgUs2dpJ7jVUUAAAwX1j/6STf2t2fHbsYAADg6+YZBnNtki+PXQgAAHCweY6s/0OSy6rqXUm+cqCxu126EQAARjRPWH/L8AAAANbRimG9qo5I8h+6+2nrVA8AADBYccx6d9+U5Jur6shDXXFVnVNV+6pqz4K2p1TVlVV1c1XtXGa5e1TVu6rqQ8O8zzvUbQMAwFYwzzCYjyZ5d1W9NbPx60mS7v6lVZY7N8mvJnnNgrY9SZ6c5DdXWO5rSX66u/+6qu6Q5NKqurC7PzRHrQAAsGXME9Y/MjxuleQO8664uy+uqh2L2q5Kkqpaabnrklw3TH+xqq5KcvckwjoAAIeVVcN6d/+v9ShkKUPYf1CS960wzxlJzkiSe97znutSFwAArId5rrN+IBAv+3oMVXX7JL+f5Pnd/YXl5uvus7p7Z3fv3L59+9hlAQDAupkrrCdZPG5l+XEsa6Cqbp1ZUH9dd//BmNsCAICpmiusd/dvrvR6LdVsQPtvJ7lqjpNYAQBgy1p1zHpV3SbJf0yyY+H83f2zqyx3fpKTkxxbVXuTvCTJ9UlelWR7krdV1WXd/ZiquluSs7v71CTfmeTpSa6oqsuG1f2P7r7gEH82AADY1Oa5GswfJrkhyaVJvjLvirv7tGXeevMS834qyanD9F9m5GE2AACwGcwT1o/r7lNGrwQAADjIPGPW31NV/370SgAAgIPMc2T94UmeWVUfy2wYTCXp7r7/qJUBAMBhbp6w/tjRqwAAAP6FVYfBdPcnkhyd5PHD4+ihDYBN7JJLkpe+dPYMwDTNc+nG5yV5dpIDNyd6bVWd1d2vGrUyAEZzySXJox6V3HhjcuSRyUUXJSedtNFVAbDYPCeYnp7k27v7xd394iTfkVl4B2CT2rVrFtRvumn2vGvXRlcEwFLmCeuV5KYFr2+K66ADbGonnzw7on7EEbPnk0/e6IoAWMo8J5j+TpL3VdWBmxk9Mclvj1cSAGM76aTZ0Jddu2ZB3RAYgGlaNax39y9V1a7MLuGYJM/q7g+MWhUAozvpJCEdYOqWDetVdVR3f6Gqjkny8eFx4L1juvv68csDAIDD10pH1n83yeOSXJqkF7TX8PpeI9YFAACHvWXDenc/bng+fv3KAQAADlj1ajBVddE8bQAAwNpaacz6bZN8Y5Jjq+pO+frlGo9Kcvd1qA0AAA5rK41Z/9Ekz09yt8zGrR8I619I8qsj1wUAAIe9lcasvyLJK6rqud39qnWsCQAAyHzXWX9VVd0vyYlJbrug/TVjFgYAAIe7VcN6Vb0kycmZhfULkjw2yV8mEdYBAGBEq14NJskPJHlUkk9397OSPCDJHUetCgAAmCus/2N335zka1V1VJJ9Se4xblkAAMCqw2CS7K6qo5P8VmZXhflSkktGrQoAAJjrBNPnDJOvrqp3JDmquz84blkAAMBKN0V68Ervdfdfj1MSAACQrHxk/f8Oz7dNsjPJ5ZndGOn+SXYnOWnc0gAA4PC27Amm3f2I7n5EkuuSPLi7d3b3Q5I8KMkn16tAAAA4XM1zNZgTuvuKAy+6e0+S+4xXEgAAkMx3NZgPVtXZSV47vH5aEieYAgDAyOYJ689K8uNJnje8vjjJb4xWEQAAkGS+Szf+U5JfHh4AAMA6WenSjW/s7v9UVVck6cXvd/f9R60MAAAOcysdWT8w7OVx61EIAABwsGXDendfNzx/Yv3KAQAADlhpGMwXs8Twl8xujNTdfdRoVQEAACseWb/DehYCAAAcbJ6bIiVJquqbquqeBx5zzH9OVe2rqj0L2p5SVVdW1c1VtXOFZU+pqqur6tqqOnPeGgEAYCtZNaxX1fdX1TVJPpbkz5N8PMnb51j3uUlOWdS2J8mTM7tW+3LbOyLJryV5bJITk5xWVSfOsT0AANhS5jmy/nNJviPJ33T38UkeleS9qy3U3RcnuX5R21XdffUqiz40ybXd/dHuvjHJ65M8YY46AQBgS5knrH+1uz+X5FZVdavufleSZYewrIG7J/m7Ba/3Dm1Lqqozqmp3Ve3ev3//iGUBAMD6WvUOpkk+X1W3T/IXSV5XVfuS/MO4Zc2vu89KclaS7Ny5c6mr1wAAwKY0z5H1dyW5Y2Y3SXpHko8kefyINX0yyT0WvD5uaAMAgMPKPGF9W5I/SbIryR2SvGEYFjOWv0py76o6vqqOTPLUJG8dcXsAADBJq4b17v5f3X3fJD+R5K5J/ryq/nS15arq/CSXJDmhqvZW1elV9aSq2pvkpCRvq6p3DvPeraouGLb3tSQ/meSdSa5K8sbuvvIW/nwAALBpzTNm/YB9ST6d5HNJvmm1mbv7tGXeevMS834qyakLXl+Q5IJDqA0AALacea6z/pyq2pXkoiR3TvLs7r7/2IUBAMDhbp4j6/dI8vzuvmzsYgAAgK9bNax39wvXoxAAAOBg81wNBgAA2ADCOgAATJSwDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBECesAADBRwjoAAEyUsA4AABMlrAMAwEQJ6wAAMFHCOgAATJSwDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBECesAADBRwjoAAEyUsA4AABMlrAMAwEQJ6wAAMFHCOgAATJSwDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBEjRrWq+qcqtpXVXsWtB1TVRdW1TXD852WWfZlVXVlVV1VVa+sqhqzVgAAmJqxj6yfm+SURW1nJrmou++d5KLh9UGq6mFJvjPJ/ZPcL8m3JfmeUSsFAICJGTWsd/fFSa5f1PyEJOcN0+cleeJSiya5bZIjk9wmya2TfGakMgEAYJI2Ysz6Xbr7umH600nusniG7r4kybuSXDc83tndV61fiQAAsPE29ATT7u7MjqIfpKq+Ncl9khyX5O5JHllV37XUOqrqjKraXVW79+/fP2q9AACwnjYirH+mqu6aJMPzviXmeVKS93b3l7r7S0nenuSkpVbW3Wd1987u3rl9+/bRigYAgPW2EWH9rUmeMUw/I8kfLjHP3yb5nqraVlW3zuzkUsNgAAA4rIx96cbzk1yS5ISq2ltVpyf5xSTfW1XXJHn08DpVtbOqzh4WfVOSjyS5IsnlSS7v7j8as1YAAJiabWOuvLtPW+atRy0x7+4kPzJM35TkR0csDQAAJs8dTAEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkS1gEAYKKEdQAAmChhHQAAJkpYBwCAiRLWAQBgooR1AACYKGEdAAAmSlgHAICJEtYBAGCihHUAAJgoYR0AACZKWAcAgIkaLaxX1TlVta+q9ixoO6aqLqyqa4bnOy2z7D2r6k+q6qqq+lBV7RirTgAAmKoxj6yfm+SURW1nJrmou++d5KLh9VJek+Tl3X2fJA9Nsm+sIgEAYKpGC+vdfXGS6xc1PyHJecP0eUmeuHi5qjoxybbuvnBYz5e6+8tj1QkAAFO13mPW79Ld1w3Tn05ylyXm+bdJPl9Vf1BVH6iql1fVEcutsKrOqKrdVbV7//79Y9QMAAAbYsNOMO3uTtJLvLUtyXcleUGSb0tyryTPXGE9Z3X3zu7euX379jFKBQCADbHeYf0zVXXXJBmelxqLvjfJZd390e7+WpK3JHnwOtYIAACTsN5h/a1JnjFMPyPJHy4xz18lObqqDhwmf2SSD61DbQAAMCljXrrx/CSXJDmhqvZW1elJfjHJ91bVNUkePbxOVe2sqrOTpLtvymwIzEVVdUWSSvJbY9UJAABTVbOh41vDzp07e/fu3RtdBgAAW1hVXdrdO9djW+5gCgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAATJawDAMBECesAADBRwjoAAEyUsA4AABO1pe5gWlVfTHL1RtexRRyb5LMbXcQWoj/Xlv5cO/pybenPtaU/146+XFsndPcd1mND29ZjI+vo6vW69etWV1W79eXa0Z9rS3+uHX25tvTn2tKfa0dfrq2q2r1e2zIMBgAAJkpYBwCAidpqYf2sjS5gC9GXa0t/ri39uXb05drSn2tLf64dfbm21q0/t9QJpgAAsJVstSPrAACwZQjrAAAwUZMJ61V1TFVdWFXXDM93Wma+ZwzzXFNVz1jQ/pCquqKqrq2qV1ZVDe1Pqaorq+rmqtq5aF0vHOa/uqoes6D9lKHt2qo6c6yfeUwj9ueS662qn6mqy4bHnqq6qaqOGd77+LCuy9bzUkdrZQP68uSqumFBf754wbrsm4fen0+rqg8Oy7ynqh6wYF2bdt9cbV+oqttU1RuG999XVTsWvHdIn31VdfywjmuHdR652jY2k3Xuy9cN7Xuq6pyquvXQvuzv/Wazzv15blV9bEG/PXBor+Hz4drh9//B4/7U41nn/vyLBX35qap6y9C+JfbPkfrynKraV1V7Fq1rue+kQ983u3sSjyQvS3LmMH1mkv+9xDzHJPno8HynYfpOw3vvT/IdSSrJ25M8dmi/T5ITkuxKsnPBuk5McnmS2yQ5PslHkhwxPD6S5F5JjhzmOXGj+2dC/TnPeh+f5M8WvP54kmM3uk82S18mOTnJHy+xDfvmLevPhy1Y9rFJ3rfZ98159oUkz0ny6mH6qUneMEwf8mdfkjcmeeow/eokP77SNjbTYwP68tRh360k5y/oyyV/7zfbYwP689wkP7BEHacOnw81fF68b6yfeSv156L1/n6SH9oq++cYfTm8991JHpxkz6J1LfeddMj75mSOrCd5QpLzhunzkjxxiXkek+TC7r6+u/8+yYVJTqmquyY5qrvf27OeeM2B5bv7qu5e6q6mT0jy+u7+Snd/LMm1SR46PK7t7o92941JXj/Mu9mM0p9zrve0zL6EtoqN7MuF7Ju3oD+7+z3DOpLkvUmOW+sfaAPMsy8s7I83JXlUVVUO8bNvWOaRwzqSg//NltvGZrJufZkk3X1BDzL7j+dW2B8XWtf+XMETkrxm6Or3Jjl6+PzYbDakP6vqqMx+798y0s+1Ecboy3T3xUmuX2J7y33XHfK+OaWwfpfuvm6Y/nSSuywxz92T/N2C13uHtrsP04vbV7LSupZq32zG6s8V11tV35jklMz+R35AJ/mTqrq0qs64BT/LRtuIvjypqi6vqrdX1X1X2cZmsyH75uD0zI5oHLBZ98159oV/nqe7v5bkhiR3XmHZ5drvnOTzwzoWb2u5bWwm69mX/2wY/vL0JO9Y0LzU7/1msxH9+fPDcIJfrqrbHEIdm8GG7J+ZBcuLuvsLC9o2+/45Rl+uZLnvpENe17ZVNrSmqupPk/ybJd560cIX3d1V5ZqSq9jo/lxmvY9P8u7uXvi/zId39yer6puSXFhVHx7+JzoZE+vLv07yzd39pao6NbMjG/de622OaWL9eaCmR2QW1h++oHny+yZb1q8nubi7/2J4vel/7zfICzMLQkdmdt3r/57kZze0oq3htCRnL3ht//xX+Nd+161rWO/uRy/3XlV9pqru2t3XDX8O2LfEbJ/MbNzUAcdlNhb9kzn4T4nHDW0r+WSSeyyzzHLtk7JB/bnaep+aRUNguvuTw/O+qnpzZn86mlQgmlJfLjyS0d0XVNWvV9WxWXmfnZQp9eewzftn9sXz2O7+3II6J79vLmOefeHAPHuraluSOyb53CrLLtX+ucz+TLttONK0cP7ltrGZrGdfJkmq6iVJtif50QNty/3ed/dnb+HPtVHWtT8XHLn8SlX9TpIXHEIdm8FG7J/HZvZZ+KQDbVtk/xyrL5ez3HfSoa+rJzDov2cD7l+egwfiv2yJeY5J8rHMTji70zB9zPDe4pPOTl207K4cfILpfXPwyQIfzezkg23D9PH5+gkI993o/plKf6603sx26uuT3G5B2+2S3GHB9HuSnLLR/TPlvszsiPSBG5Y9NMnfDsvaN29Zf94zs/GFD1u0jU27b86zLyT5iRx8otQbh+lD/uxL8ns5+ATT56y0jc302IC+/JFhX/uGRdtY8vd+o/tnE/TnXYfnSvIrSX5xeP19OfgkvvdvdN9shv4clvuxJOdttf1zjL5csNyO/MsTTJf7TjrkfXPDO2/BD3XnJBcluSbJn+brX8w7k5y9YL4fzuyL99okz1rQvjPJnszO0P3VBTvVkzIbD/SVJJ9J8s4Fy7xomP/qDFeUGNpPTfI3w3sv2ui+mVh/Lrne4b1nZnYCxsI67jXs4JcnuXIz9ud692WSnxz66vLMToh82IJ12TcPvT/PTvL3SS4bHru3wr651L6Q2Z//v3+Yvm1mIfvazP6Dc68Fyx7SZ9/QV+8f1vV7SW6z2jY202Od+/JrQ9uB/fHFQ/uyv/eb7bHO/flnSa4YPhNem+T2Q3sl+bVh/iuy4GDdZnusZ38O7+3KogMXW2X/HKkvz09yXZKvZpY3Tx/al/tOOuR988CXHAAAMDFTuhoMAACwgLAOAAATJawDAMBECesAADBRwjoAAEyUsA4wQVV1dFU9Z5i+W1W9acRtPXC4KyEAEyOsA0zT0UmekyTd/anu/oERt/XAzK4/DMDEuM46wARV1euTPCGzG3Bck+Q+3X2/qnpmkidmdtfVeyf5P5ndje/pmd387dTuvr6qviWzG29sT/LlJM/u7g9X1VOSvCTJTUluSPLozG4A8g2Z3fL6pZndMfYVmd0g5B8zuynV1Yew7V2Z3TzlezK7a+APd/f7x+kpgK3NkXWAaTozyUe6+4FJfmbRe/dL8uQk35bk55N8ubsflOSSJD80zHNWkud290OSvCDJrw/tL07ymO5+QGZ37btxaHtDdz+wu9+Q5MNJvmtY54uT/MIhbjtJvnGo/TlJzvnXdQXA4WvbRhcAwCF7V3d/MckXq+qGJH80tF+R5P5VdfskD0vye1V1YJnbDM/vTnJuVb0xyR8ss/47Jjmvqu6dpJPcet5tL5jv/CTp7our6qiqOrq7P38Lf16Aw5awDrD5fGXB9M0LXt+c2ef6rZJ8fjiyfZDu/rGq+vYk35fk0qp6yBLr/7nMQvmTqmpHkl2HsO1/3tTiTa/w8wCwDMNgAKbpi0nucEsW7O4vJPnYMD49NfOAYfpbuvt93f3iJPuT3GOJbd0xs/HrSfLMW1Z+fnDY3sOT3NDdN9zC9QAc1oR1gAnq7s8leXdV7Uny8luwiqclOb2qLk9yZWYnqybJy6vqimG978nsRNB3JTmxqi6rqh9M8rIkL62qD+SW/wX2n4blX53k9Fu4DoDDnqvBALCmhqvBvKC7d290LQCbnSPrAAAwUY6sAwDARDmyDgAAEyWsAwDARAnrAAAwUcI6AABMlLAOAAAT9f8Bj8HzDp5Fm34AAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "from sagemaker.analytics import TrainingJobAnalytics\n", "\n", "metric_name = 'validation:rmse'\n", "\n", "metrics_dataframe = TrainingJobAnalytics(training_job_name=job_name, metric_names=[metric_name]).dataframe()\n", "plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)\n", "plt.set_ylabel(metric_name);" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bikenow-xgboost-regression-2019-12-19-02-00-00-model\n", "https://s3-us-east-1.amazonaws.com/bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf/sagemaker/bikenow-xgboost-regression/single-xgboost/bikenow-xgboost-regression-2019-12-19-02-00-00/output/model.tar.gz\n", "arn:aws:sagemaker:us-east-1:699772153487:model/bikenow-xgboost-regression-2019-12-19-02-00-00-model\n", "CPU times: user 15.7 ms, sys: 238 µs, total: 15.9 ms\n", "Wall time: 277 ms\n" ] } ], "source": [ "%%time\n", "import boto3\n", "from time import gmtime, strftime\n", "\n", "model_name=job_name + '-model'\n", "print(model_name)\n", "\n", "info = client.describe_training_job(TrainingJobName=job_name)\n", "model_data = info['ModelArtifacts']['S3ModelArtifacts']\n", "print(model_data)\n", "\n", "primary_container = {\n", " 'Image': container,\n", " 'ModelDataUrl': model_data\n", "}\n", "\n", "create_model_response = client.create_model(\n", " ModelName = model_name,\n", " ExecutionRoleArn = role,\n", " PrimaryContainer = primary_container)\n", "\n", "print(create_model_response['ModelArn'])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bikenow-XGBoostEndpointConfig-2019-12-19-02-06-19\n", "Endpoint Config Arn: arn:aws:sagemaker:us-east-1:699772153487:endpoint-config/bikenow-xgboostendpointconfig-2019-12-19-02-06-19\n" ] } ], "source": [ "from time import gmtime, strftime\n", "\n", "endpoint_config_name = 'bikenow-XGBoostEndpointConfig-' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", "print(endpoint_config_name)\n", "create_endpoint_config_response = client.create_endpoint_config(\n", " EndpointConfigName = endpoint_config_name,\n", " ProductionVariants=[{\n", " 'InstanceType':'ml.m4.xlarge',\n", " 'InitialVariantWeight':1,\n", " 'InitialInstanceCount':1,\n", " 'ModelName':model_name,\n", " 'VariantName':'AllTraffic'}])\n", "\n", "print(\"Endpoint Config Arn: \" + create_endpoint_config_response['EndpointConfigArn'])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bikenow-XGBoostEndpoint-2019-12-19-02-06-37\n", "arn:aws:sagemaker:us-east-1:699772153487:endpoint/bikenow-xgboostendpoint-2019-12-19-02-06-37\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: Creating\n", "Status: InService\n", "Arn: arn:aws:sagemaker:us-east-1:699772153487:endpoint/bikenow-xgboostendpoint-2019-12-19-02-06-37\n", "Status: InService\n", "CPU times: user 127 ms, sys: 12.5 ms, total: 140 ms\n", "Wall time: 10min 1s\n" ] } ], "source": [ "%%time\n", "import time\n", "\n", "endpoint_name = 'bikenow-XGBoostEndpoint-' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", "print(endpoint_name)\n", "create_endpoint_response = client.create_endpoint(\n", " EndpointName=endpoint_name,\n", " EndpointConfigName=endpoint_config_name)\n", "print(create_endpoint_response['EndpointArn'])\n", "\n", "resp = client.describe_endpoint(EndpointName=endpoint_name)\n", "status = resp['EndpointStatus']\n", "print(\"Status: \" + status)\n", "\n", "while status=='Creating':\n", " time.sleep(60)\n", " resp = client.describe_endpoint(EndpointName=endpoint_name)\n", " status = resp['EndpointStatus']\n", " print(\"Status: \" + status)\n", "\n", "print(\"Arn: \" + resp['EndpointArn'])\n", "print(\"Status: \" + status)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "runtime_client = boto3.client('runtime.sagemaker', region_name=region)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "!head -1 bikenow.test > bikenow.single.test" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Label: 5 \n", "Prediction: 13\n", "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", "Wall time: 24.4 ms\n" ] } ], "source": [ "%%time\n", "import json\n", "from itertools import islice\n", "import math\n", "import struct\n", "\n", "file_name = 'bikenow.single.test' #customize to your test file\n", "with open(file_name, 'r') as f:\n", " payload = f.read().strip().split(',', 1)\n", "response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, \n", " ContentType='text/csv', \n", " Body=payload[1])\n", "result = response['Body'].read()\n", "result = result.decode(\"utf-8\")\n", "result = result.split(',')\n", "result = [round(float(i)) for i in result]\n", "label = payload[0]\n", "print ('Label: ',label,'\\nPrediction: ', result[0])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import math\n", "def do_predict(data, endpoint_name, content_type):\n", " payload = '\\n'.join(data)\n", " response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, \n", " ContentType=content_type, \n", " Body=payload)\n", " result = response['Body'].read()\n", " result = result.decode(\"utf-8\")\n", " result = result.split(',')\n", " preds = [float((num)) for num in result]\n", " preds = [round(num) for num in preds]\n", " return preds\n", "\n", "def batch_predict(data, batch_size, endpoint_name, content_type):\n", " items = len(data)\n", " arrs = []\n", " \n", " for offset in range(0, items, batch_size):\n", " if offset+batch_size < items:\n", " results = do_predict(data[offset:(offset+batch_size)], endpoint_name, content_type)\n", " arrs.extend(results)\n", " else:\n", " arrs.extend(do_predict(data[offset:items], endpoint_name, content_type))\n", " sys.stdout.write('.')\n", " return(arrs)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "................................................................................................................................................................................................................................................................................................................................................................................\n", " Mean Squared Error = 79.96900764605044\n", "CPU times: user 665 ms, sys: 65 ms, total: 730 ms\n", "Wall time: 3.89 s\n" ] } ], "source": [ "%%time\n", "import json\n", "import numpy as np\n", "\n", "with open(FILE_TEST, 'r') as f:\n", " payload = f.read().strip()\n", "\n", "labels = [int(line.split(',', 1)[0]) for line in payload.split('\\n')]\n", "test_data = [line.split(',', 1)[1] for line in payload.split('\\n')]\n", "preds = batch_predict(test_data, 100, endpoint_name, 'text/csv')\n", "\n", "print('\\n Mean Squared Error = ', np.mean((np.array(labels) - np.array(preds))**2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }