{ "cells": [ { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2021-01-20T10:03:00.870970Z", "start_time": "2021-01-20T10:03:00.418377Z" }, "execution": { "iopub.execute_input": "2020-09-22T11:42:30.692313Z", "iopub.status.busy": "2020-09-22T11:42:30.692004Z", "iopub.status.idle": "2020-09-22T11:42:30.696371Z", "shell.execute_reply": "2020-09-22T11:42:30.695224Z", "shell.execute_reply.started": "2020-09-22T11:42:30.692282Z" } }, "source": [ "### ML model training\n", "\n", "Multi-label text classification training using Object2Vec Amazon SageMaker built-in algorithm." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-03T08:36:16.913987Z", "start_time": "2020-12-03T08:36:12.697973Z" } }, "outputs": [], "source": [ "import os\n", "import boto3\n", "import json\n", "import pickle\n", "import datetime\n", "import pandas as pd\n", "import time\n", "import botocore\n", "import sagemaker\n", "from sagemaker import get_execution_role\n", "from sagemaker.amazon.amazon_estimator import get_image_uri\n", "from sagemaker.predictor import json_serializer, json_deserializer\n", "from sagemaker.tuner import CategoricalParameter, HyperparameterTuner, IntegerParameter, ContinuousParameter" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bucket_name = \"YOUR_BUCKET_HERE\"\n", "prefix = \"connect/O2VInput\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def download_object(bucket_name, key, local_path):\n", " \"\"\"Download S3 object to local\"\"\"\n", " s3 = boto3.resource('s3')\n", " try:\n", " s3.Bucket(bucket_name).download_file(key,local_path)\n", " except botocore.exceptions.ClientError as e:\n", " if e.response['Error']['Code'] == \"404\":\n", " print(\"The object does not exist\")\n", " else:\n", " raise" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_dir(directory):\n", " \"\"\"Create a directory\"\"\"\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_hyperparameter_ranges():\n", " \"\"\"returns the hyperparameter ranges configured here\"\"\"\n", " #tunable parameters\n", " hyperparameter_ranges = {\n", " 'mlp_layers': IntegerParameter(2,6),\n", " 'early_stopping_patience': IntegerParameter(3,5),\n", " 'mlp_activation': CategoricalParameter(['relu','tanh']),\n", " 'dropout': ContinuousParameter(0.4, 0.8),\n", " 'learning_rate': ContinuousParameter(0.0001, 0.001),\n", " 'mini_batch_size': CategoricalParameter([512,1024])\n", " }\n", " return hyperparameter_ranges" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def update_hyperparameter(hyperparameters, name_hyper, value_hyper):\n", " \"\"\"update hyperparameter for non-tunable hyperparams\"\"\"\n", " hyperparameters[name_hyper] = value_hyper\n", " return hyperparameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Specify image URI for Object2Vec" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sess = sagemaker.Session()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker import get_execution_role\n", "role = get_execution_role()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Get docker image of ObjectToVec algorithm\n", "container = get_image_uri(boto3.Session().region_name, 'object2vec')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-01-20T10:06:03.731083Z", "start_time": "2021-01-20T10:06:03.725465Z" }, "execution": { "iopub.execute_input": "2020-09-22T13:44:15.150582Z", "iopub.status.busy": "2020-09-22T13:44:15.150277Z", "iopub.status.idle": "2020-09-22T13:44:15.155887Z", "shell.execute_reply": "2020-09-22T13:44:15.155193Z", "shell.execute_reply.started": "2020-09-22T13:44:15.150550Z" } }, "outputs": [], "source": [ "output_path = 's3://'+os.path.join(bucket_name, prefix.replace(\"O2VInput\", \"O2VOutput\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "download_object(bucket_name, prefix+\"/meta/token_to_vocab_dict.p\", \"./token_to_vocab_dict.p\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "tokens=pickle.load(open(\"token_to_vocab_dict.p\", \"br\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"My general output path for the ML model: {}\".format(output_path))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### 3.2 Sagemaker Estimator set up" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## object2vec estimator - run it within the VPC\n", "o2vec2_class = sagemaker.estimator.Estimator(container,\n", " role, \n", " instance_count=1, \n", " instance_type='ml.m4.xlarge',\n", " output_path=output_path,\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-01-20T10:06:05.418184Z", "start_time": "2021-01-20T10:06:05.412871Z" }, "execution": { "iopub.execute_input": "2020-09-22T13:44:15.477266Z", "iopub.status.busy": "2020-09-22T13:44:15.476956Z", "iopub.status.idle": "2020-09-22T13:44:15.482962Z", "shell.execute_reply": "2020-09-22T13:44:15.482356Z", "shell.execute_reply.started": "2020-09-22T13:44:15.477232Z" } }, "outputs": [], "source": [ "## hyperparameter specification\n", "hyperparameters = {\n", " \"_kvstore\": \"device\",\n", " \"_num_gpus\": 'auto',\n", " \"_num_kv_servers\": \"auto\",\n", " \"bucket_width\": 0,\n", " \"dropout\": 0.4,\n", " \"early_stopping_patience\": 3,\n", " \"early_stopping_tolerance\": 0.001,\n", " \"enc0_layers\": \"auto\",\n", " \"enc0_max_seq_len\": 50,\n", " \"enc0_network\": \"bilstm\",\n", " \"enc0_token_embedding_dim\": 300,\n", " 'enc0_vocab_file': \"\", \n", " \"enc0_vocab_size\": len(tokens),\n", " \"enc1_network\": \"enc0\",\n", " \"enc_dim\": 600,\n", " \"epochs\": 100,\n", " \"learning_rate\": 0.001,\n", " \"mini_batch_size\": 1024,\n", " \"mlp_activation\": \"relu\",\n", " \"mlp_dim\": 512,\n", " \"mlp_layers\": 2,\n", " \"num_classes\": 2, #either 0 or 1.\n", " \"optimizer\": \"adam\",\n", " \"output_layer\": \"softmax\",\n", " \"weight_decay\": 0,\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-01-20T10:06:13.575210Z", "start_time": "2021-01-20T10:06:13.568237Z" }, "execution": { "iopub.execute_input": "2020-09-22T13:44:17.182856Z", "iopub.status.busy": "2020-09-22T13:44:17.182553Z", "iopub.status.idle": "2020-09-22T13:44:17.186441Z", "shell.execute_reply": "2020-09-22T13:44:17.185823Z", "shell.execute_reply.started": "2020-09-22T13:44:17.182825Z" } }, "outputs": [], "source": [ "#input channels for validation, auxiliary (glove pretrained and vocabulary) and training.\n", "channels = {\n", " 'auxiliary': sagemaker.inputs.TrainingInput('s3://'+os.path.join(bucket_name, prefix, 'auxiliary/'),\n", " distribution='FullyReplicated', \n", " content_type='application/json'),\n", " 'train': sagemaker.inputs.TrainingInput('s3://'+os.path.join(bucket_name, prefix, 'train/train.jsonl'), \n", " distribution='ShardedByS3Key', \n", " content_type='application/jsonlines'),\n", " 'validation': sagemaker.inputs.TrainingInput('s3://'+os.path.join(bucket_name, prefix, 'val/val.jsonl'), \n", " distribution='ShardedByS3Key', \n", " content_type='application/jsonlines'),\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "job_name = 'default' + datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\")\n", "print(\"Training with job name\", job_name)\n", "o2vec2_class.set_hyperparameters(**hyperparameters)\n", "o2vec2_class.fit(channels, job_name=job_name, wait=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }