# BikeNow XGBoost Regression
#
# Session setup: resolve the SageMaker execution role and the AWS region, and
# define the S3 locations used by the rest of the notebook.
import os
import re

import boto3
from sagemaker import get_execution_role

# IAM role returned for this notebook; passed to SageMaker as RoleArn /
# ExecutionRoleArn when the training job and the model are created.
role = get_execution_role()
# Region of the default boto3 session; handed to every boto3 session/client
# created later in the notebook.
region = boto3.Session().region_name

bucket = 'bike-demo-stack-applicationdatal-s3bucketdatalake-dncpxowjgqbf'  # put your s3 bucket name here, and create s3 bucket
src_file = 'unload/station_status_history_000'  # key of the raw dataset object inside `bucket`
prefix = 'sagemaker/bikenow-xgboost-regression'  # key prefix for the uploaded splits and the training output

# customize to your bucket where you have stored the data
# The s3:// form is region-independent.  The previous dash-style URL
# (https://s3-<region>.amazonaws.com/<bucket>) is a legacy endpoint format
# that is not offered in every region.
bucket_path = 's3://{}'.format(bucket)

print(bucket_path)
# Helper functions to split data into training, validation, and testing sets.
# `boto3`, `region` and `prefix` come from the configuration cell at the top
# of the notebook.
import io
import random


def data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST,
               PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST, seed=None):
    """Randomly split the lines of a text file into train/validation/test files.

    Each split receives ``int(percent / 100 * n_lines)`` lines, drawn without
    replacement, so no line ends up in more than one split.  Lines that are
    left over (integer truncation, or percentages summing to less than 100)
    are not written anywhere.

    Args:
        FILE_DATA: Path of the input file, one record per line.
        FILE_TRAIN: Output path for the training split (overwritten).
        FILE_VALIDATION: Output path for the validation split (overwritten).
        FILE_TEST: Output path for the test split (overwritten).
        PERCENT_TRAIN: Share of input lines for the training split, in percent.
        PERCENT_VALIDATION: Share for the validation split, in percent.
        PERCENT_TEST: Share for the test split, in percent.
        seed: Optional seed for a reproducible split.  With ``None`` the
            module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Raises:
        ValueError: If a percentage is negative, or the three splits together
            ask for more lines than the input file contains.
    """
    percents = (PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST)
    if any(percent < 0 for percent in percents):
        raise ValueError('split percentages must be non-negative, got {}'.format(percents))

    # `with` guarantees the handle is closed even if reading fails.
    with open(FILE_DATA, 'r') as data_file:
        data = data_file.readlines()

    counts = [int((percent / 100.0) * len(data)) for percent in percents]
    if sum(counts) > len(data):
        raise ValueError(
            'splits request {} lines but {} only has {}'.format(sum(counts), FILE_DATA, len(data)))

    # A single shuffle followed by slicing gives a uniform split without
    # replacement in linear time; popping a random index per row was quadratic.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(data)

    start = 0
    for path, count in zip((FILE_TRAIN, FILE_VALIDATION, FILE_TEST), counts):
        with open(path, 'w') as split_file:
            split_file.writelines(data[start:start + count])
        start += count


def write_to_s3(fobj, bucket, key):
    """Upload the binary file object ``fobj`` to ``s3://<bucket>/<key>``.

    Uses a boto3 session bound to the notebook-level ``region``.  Returns
    whatever ``upload_fileobj`` returns.
    """
    s3_object = boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(key)
    return s3_object.upload_fileobj(fobj)


def upload_to_s3(bucket, channel, filename):
    """Upload a local file to ``s3://<bucket>/<prefix>/<channel>/<filename>``.

    ``prefix`` is the notebook-level key prefix.  The object key now includes
    ``filename`` so that the logged URL is the location actually written;
    previously the object was stored under ``<prefix>/<channel>`` while the
    message claimed ``<prefix>/<channel>/<filename>``.

    NOTE(review): buckets populated by the earlier version may still hold an
    object named ``<prefix>/<channel>``.  An ``S3Prefix`` data source pointing
    at ``<prefix>/<channel>`` would match both objects, so remove the stale
    one before retraining.
    """
    key = '{}/{}/{}'.format(prefix, channel, filename)
    url = 's3://{}/{}'.format(bucket, key)
    print('Writing to {}'.format(url))
    # Close the local file once the upload has finished (it was leaked before).
    with open(filename, 'rb') as fobj:
        write_to_s3(fobj, bucket, key)
# Download data and split files.
import urllib.request

# Load the dataset: copy the raw export from the data lake bucket into a
# local file named by FILE_DATA.
FILE_DATA = 'bikenow'
(
    boto3.Session(region_name=region)
    .resource('s3')
    .Bucket(bucket)
    .Object(src_file)
    .download_file(FILE_DATA)
)

# Local output files and the share of rows (in percent) written to each.
FILE_TRAIN, FILE_VALIDATION, FILE_TEST = 'bikenow.train', 'bikenow.validation', 'bikenow.test'
PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST = 70, 15, 15

# Split the downloaded data into train/validation/test files.
data_split(
    FILE_DATA=FILE_DATA,
    FILE_TRAIN=FILE_TRAIN,
    FILE_VALIDATION=FILE_VALIDATION,
    FILE_TEST=FILE_TEST,
    PERCENT_TRAIN=PERCENT_TRAIN,
    PERCENT_VALIDATION=PERCENT_VALIDATION,
    PERCENT_TEST=PERCENT_TEST,
)

# Upload each split to its own channel in the S3 bucket.
upload_to_s3(bucket=bucket, channel='train', filename=FILE_TRAIN)
upload_to_s3(bucket=bucket, channel='validation', filename=FILE_VALIDATION)
upload_to_s3(bucket=bucket, channel='test', filename=FILE_TEST)
import time
from time import gmtime, strftime

import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.analytics import TrainingJobAnalytics

# --- Get XGBoost container image ---------------------------------------------
# Image URI of the built-in XGBoost algorithm (version tag 0.90-1) for `region`.
container = get_image_uri(region, 'xgboost', '0.90-1')

# --- Create training job -----------------------------------------------------
# The UTC timestamp suffix gives every run its own job name.
job_name = 'bikenow-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

# Ensure that the training and validation data folders generated above are
# reflected in the "InputDataConfig" parameter below.
create_training_params = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/" + prefix + "/single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "silent": "0",
        "objective": "reg:linear",
        "num_round": "50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/train',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/validation',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        }
    ]
}

client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

# Poll once a minute while the job is still running.  Only the in-flight
# states keep the loop alive, so any terminal state (including 'Stopped',
# which the previous `!= 'Completed' and != 'Failed'` test never left) ends
# the wait.
status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status in ('InProgress', 'Stopping'):
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

# Fail fast: without a completed job there are no metrics to plot and no
# model artifact to register below.
if status != 'Completed':
    failure_reason = client.describe_training_job(TrainingJobName=job_name).get(
        'FailureReason', 'no failure reason reported')
    raise RuntimeError(
        'Training job {} ended with status {}: {}'.format(job_name, status, failure_reason))

# --- Plot the validation metric ----------------------------------------------
# Plain-Python spelling of the `%matplotlib inline` line magic.
get_ipython().run_line_magic('matplotlib', 'inline')

metric_name = 'validation:rmse'

metrics_dataframe = TrainingJobAnalytics(training_job_name=job_name, metric_names=[metric_name]).dataframe()
# NOTE: `plt` is the object returned by DataFrame.plot (it is used only for
# set_ylabel), not the matplotlib.pyplot module.
plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
plt.set_ylabel(metric_name);

# --- Register the trained model ----------------------------------------------
model_name = job_name + '-model'
print(model_name)

# Location of the model artifact produced by the training job.
info = client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

# Serve the artifact with the same container image that trained it.
primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container)

print(create_model_response['ModelArn'])
import time
from time import gmtime, strftime

# --- Endpoint configuration --------------------------------------------------
# One production variant serving `model_name` on a single ml.m4.xlarge
# instance that receives all traffic.
endpoint_config_name = 'bikenow-XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[{
        'InstanceType': 'ml.m4.xlarge',
        'InitialVariantWeight': 1,
        'InitialInstanceCount': 1,
        'ModelName': model_name,
        'VariantName': 'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

# --- Endpoint ----------------------------------------------------------------
endpoint_name = 'bikenow-XGBoostEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

# Poll once a minute until the endpoint leaves the 'Creating' state.
resp = client.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print("Status: " + status)

while status == 'Creating':
    time.sleep(60)
    resp = client.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

# Fail fast: the cells that follow invoke this endpoint, so stop here with
# the reported reason instead of letting them fail against an endpoint that
# never came into service.
if status != 'InService':
    raise RuntimeError(
        'Endpoint {} ended with status {}: {}'.format(
            endpoint_name, status, resp.get('FailureReason', 'no failure reason reported')))
import json
import math
import struct
import sys
from itertools import islice

import numpy as np

# Client for invoking the deployed endpoint.
runtime_client = boto3.client('runtime.sagemaker', region_name=region)

# Copy the first test record into its own file.  Pure-Python equivalent of
# the shell step `head -1 bikenow.test > bikenow.single.test`, so the
# notebook does not depend on a POSIX shell.
with open('bikenow.test', 'r') as test_file, open('bikenow.single.test', 'w') as single_file:
    single_file.write(test_file.readline())

# --- Single-record smoke test ------------------------------------------------
file_name = 'bikenow.single.test'  # customize to your test file
with open(file_name, 'r') as f:
    # First CSV field is the label; the remainder is the feature payload.
    payload = f.read().strip().split(',', 1)
response = runtime_client.invoke_endpoint(EndpointName=endpoint_name,
                                          ContentType='text/csv',
                                          Body=payload[1])
result = response['Body'].read()
result = result.decode("utf-8")
result = result.split(',')
result = [round(float(i)) for i in result]
label = payload[0]
print ('Label: ',label,'\nPrediction: ', result[0])


def do_predict(data, endpoint_name, content_type):
    """Send one batch of records to the endpoint and return rounded predictions.

    Args:
        data: Sequence of record strings; they are joined with newlines to
            form the request body.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Value passed as the request ``ContentType``.

    Returns:
        A list with one ``round()``-ed prediction per value in the response.
        An empty ``data`` yields ``[]`` without calling the endpoint.
    """
    if not data:
        # Nothing to score; avoid sending an empty request body.
        return []
    payload = '\n'.join(data)
    response = runtime_client.invoke_endpoint(EndpointName=endpoint_name,
                                              ContentType=content_type,
                                              Body=payload)
    result = response['Body'].read().decode("utf-8")
    # Accept both comma- and newline-separated values and ignore blank tokens
    # (e.g. from a trailing separator).
    # NOTE(review): the comma format is what this notebook's container
    # returned; newline-separated output is presumably used by other
    # container versions -- confirm before relying on it.
    values = result.replace('\n', ',').split(',')
    return [round(float(value)) for value in values if value.strip()]


def batch_predict(data, batch_size, endpoint_name, content_type):
    """Score ``data`` in chunks of ``batch_size`` records via ``do_predict``.

    Writes one '.' to stdout per chunk as a progress indicator.

    Returns:
        The concatenated predictions of all chunks, in input order.
    """
    arrs = []
    for offset in range(0, len(data), batch_size):
        # Slicing clamps at the end of the list, so the final (possibly
        # shorter) chunk needs no special case.
        arrs.extend(do_predict(data[offset:offset + batch_size], endpoint_name, content_type))
        sys.stdout.write('.')
    return arrs


# --- Evaluate on the held-out test split -------------------------------------
with open(FILE_TEST, 'r') as f:
    payload = f.read().strip()

# Split every line once: first CSV field is the label, the rest are features.
rows = [line.split(',', 1) for line in payload.split('\n')]
labels = [int(row[0]) for row in rows]
test_data = [row[1] for row in rows]
preds = batch_predict(test_data, 100, endpoint_name, 'text/csv')

# Guard against a silently broadcast (and therefore meaningless) error value.
assert len(preds) == len(labels), \
    'got {} predictions for {} labels'.format(len(preds), len(labels))

print('\n Mean Squared Error = ', np.mean((np.array(labels) - np.array(preds))**2))
"3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }