{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:03:00.870970Z",
     "start_time": "2021-01-20T10:03:00.418377Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T11:42:30.692313Z",
     "iopub.status.busy": "2020-09-22T11:42:30.692004Z",
     "iopub.status.idle": "2020-09-22T11:42:30.696371Z",
     "shell.execute_reply": "2020-09-22T11:42:30.695224Z",
     "shell.execute_reply.started": "2020-09-22T11:42:30.692282Z"
    }
   },
   "source": [
    "### ML model training\n",
    "\n",
    "Train a multi-label text classifier with the Amazon SageMaker built-in Object2Vec algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-03T08:36:16.913987Z",
     "start_time": "2020-12-03T08:36:12.697973Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import boto3\n",
    "import json\n",
    "import pickle\n",
    "import datetime\n",
    "import pandas as pd\n",
    "import time\n",
    "import botocore\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
    "from sagemaker.predictor import json_serializer, json_deserializer\n",
    "from sagemaker.tuner import CategoricalParameter, HyperparameterTuner, IntegerParameter, ContinuousParameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 bucket name; replace the placeholder value before running the notebook.\n",
    "bucket_name = \"YOUR_BUCKET_HERE\"\n",
    "# Key prefix of the inputs; later cells read meta/, auxiliary/, train/ and val/ below it.\n",
    "prefix = \"connect/O2VInput\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_object(bucket_name, key, local_path):\n",
    "    \"\"\"Fetch the S3 object `key` from `bucket_name` and store it at `local_path`.\n",
    "\n",
    "    A client error whose code is '404' is reported with a printed message;\n",
    "    any other client error is re-raised to the caller.\n",
    "    \"\"\"\n",
    "    s3_resource = boto3.resource('s3')\n",
    "    try:\n",
    "        target_bucket = s3_resource.Bucket(bucket_name)\n",
    "        target_bucket.download_file(key, local_path)\n",
    "    except botocore.exceptions.ClientError as err:\n",
    "        error_code = err.response['Error']['Code']\n",
    "        if error_code != \"404\":\n",
    "            raise\n",
    "        print(\"The object does not exist\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dir(directory):\n",
    "    \"\"\"Create `directory`, including any missing parent directories.\n",
    "\n",
    "    An already existing directory is left untouched. `exist_ok=True` replaces\n",
    "    the separate existence check, so there is no gap between checking and\n",
    "    creating in which another process could create the directory first.\n",
    "\n",
    "    Raises:\n",
    "        FileExistsError: if `directory` exists but is not a directory.\n",
    "    \"\"\"\n",
    "    os.makedirs(directory, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_hyperparameter_ranges():\n",
    "    \"\"\"Build the search space of the tunable hyperparameters.\n",
    "\n",
    "    Returns:\n",
    "        dict: hyperparameter name -> sagemaker.tuner parameter object\n",
    "        holding the range (integer / continuous) or the list of choices.\n",
    "    \"\"\"\n",
    "    return dict(\n",
    "        mlp_layers=IntegerParameter(2, 6),\n",
    "        early_stopping_patience=IntegerParameter(3, 5),\n",
    "        mlp_activation=CategoricalParameter(['relu', 'tanh']),\n",
    "        dropout=ContinuousParameter(0.4, 0.8),\n",
    "        learning_rate=ContinuousParameter(0.0001, 0.001),\n",
    "        mini_batch_size=CategoricalParameter([512, 1024]),\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def update_hyperparameter(hyperparameters, name_hyper, value_hyper):\n",
    "    \"\"\"Set one non-tunable hyperparameter and hand the mapping back.\n",
    "\n",
    "    Args:\n",
    "        hyperparameters: mapping of hyperparameter names to values; modified in place.\n",
    "        name_hyper: key to add or overwrite.\n",
    "        value_hyper: value stored under `name_hyper`.\n",
    "\n",
    "    Returns:\n",
    "        The same `hyperparameters` object that was passed in.\n",
    "    \"\"\"\n",
    "    hyperparameters[name_hyper] = value_hyper\n",
    "    return hyperparameters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Specify image URI for Object2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SageMaker session created with no arguments, i.e. with the SDK's defaults.\n",
    "sess = sagemaker.Session()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this import repeats the one in the import cell at the top of the notebook.\n",
    "from sagemaker import get_execution_role\n",
    "# Role returned by get_execution_role(); it is passed to the Estimator further down.\n",
    "role = get_execution_role()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the container image URI of the built-in Object2Vec algorithm for the\n",
    "# region of the default boto3 session. `sagemaker.image_uris.retrieve` is the\n",
    "# SageMaker Python SDK v2 replacement for the deprecated `get_image_uri`;\n",
    "# version '1' is the default that `get_image_uri` used.\n",
    "container = sagemaker.image_uris.retrieve(framework='object2vec',\n",
    "                                          region=boto3.Session().region_name,\n",
    "                                          version='1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:06:03.731083Z",
     "start_time": "2021-01-20T10:06:03.725465Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T13:44:15.150582Z",
     "iopub.status.busy": "2020-09-22T13:44:15.150277Z",
     "iopub.status.idle": "2020-09-22T13:44:15.155887Z",
     "shell.execute_reply": "2020-09-22T13:44:15.155193Z",
     "shell.execute_reply.started": "2020-09-22T13:44:15.150550Z"
    }
   },
   "outputs": [],
   "source": [
    "# Output location: the input prefix with 'O2VInput' swapped for 'O2VOutput'.\n",
    "# The URI is assembled with '/' explicitly; os.path.join would use the local\n",
    "# OS path separator, which is not '/' on every platform.\n",
    "output_path = 's3://{}/{}'.format(bucket_name, prefix.replace(\"O2VInput\", \"O2VOutput\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy <prefix>/meta/token_to_vocab_dict.p from the bucket into the current working directory.\n",
    "download_object(bucket_name, prefix+\"/meta/token_to_vocab_dict.p\", \"./token_to_vocab_dict.p\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "# NOTE(review): unpickling can execute arbitrary code; only load this file if the\n",
    "# bucket contents are trusted.\n",
    "# The context manager closes the file handle as soon as the object is loaded.\n",
    "with open(\"token_to_vocab_dict.p\", \"br\") as vocab_file:\n",
    "    tokens = pickle.load(vocab_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries in the loaded object; the same value is used as enc0_vocab_size further down.\n",
    "len(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the S3 URI that is passed to the Estimator as output_path.\n",
    "print(\"My general output path for the ML model: {}\".format(output_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 3.2 SageMaker Estimator setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## object2vec estimator - run it within the VPC\n",
    "# NOTE(review): no `subnets` / `security_group_ids` arguments are passed here, so the\n",
    "# 'within the VPC' remark above is not reflected in this call - confirm the intended network setup.\n",
    "# Requests one ml.m4.xlarge instance; output_path is the S3 URI built earlier in this notebook.\n",
    "o2vec2_class = sagemaker.estimator.Estimator(container,\n",
    "                                          role, \n",
    "                                          instance_count=1, \n",
    "                                          instance_type='ml.m4.xlarge',\n",
    "                                          output_path=output_path,\n",
    "                                          )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:06:05.418184Z",
     "start_time": "2021-01-20T10:06:05.412871Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T13:44:15.477266Z",
     "iopub.status.busy": "2020-09-22T13:44:15.476956Z",
     "iopub.status.idle": "2020-09-22T13:44:15.482962Z",
     "shell.execute_reply": "2020-09-22T13:44:15.482356Z",
     "shell.execute_reply.started": "2020-09-22T13:44:15.477232Z"
    }
   },
   "outputs": [],
   "source": [
    "# Fixed hyperparameter values for the training job, listed alphabetically by name.\n",
    "hyperparameters = dict(\n",
    "    _kvstore=\"device\",\n",
    "    _num_gpus=\"auto\",\n",
    "    _num_kv_servers=\"auto\",\n",
    "    bucket_width=0,\n",
    "    dropout=0.4,\n",
    "    early_stopping_patience=3,\n",
    "    early_stopping_tolerance=0.001,\n",
    "    enc0_layers=\"auto\",\n",
    "    enc0_max_seq_len=50,\n",
    "    enc0_network=\"bilstm\",\n",
    "    enc0_token_embedding_dim=300,\n",
    "    enc0_vocab_file=\"\",\n",
    "    # vocabulary size is taken from the loaded token dict\n",
    "    enc0_vocab_size=len(tokens),\n",
    "    enc1_network=\"enc0\",\n",
    "    enc_dim=600,\n",
    "    epochs=100,\n",
    "    learning_rate=0.001,\n",
    "    mini_batch_size=1024,\n",
    "    mlp_activation=\"relu\",\n",
    "    mlp_dim=512,\n",
    "    mlp_layers=2,\n",
    "    # two classes: the label is either 0 or 1\n",
    "    num_classes=2,\n",
    "    optimizer=\"adam\",\n",
    "    output_layer=\"softmax\",\n",
    "    weight_decay=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-20T10:06:13.575210Z",
     "start_time": "2021-01-20T10:06:13.568237Z"
    },
    "execution": {
     "iopub.execute_input": "2020-09-22T13:44:17.182856Z",
     "iopub.status.busy": "2020-09-22T13:44:17.182553Z",
     "iopub.status.idle": "2020-09-22T13:44:17.186441Z",
     "shell.execute_reply": "2020-09-22T13:44:17.185823Z",
     "shell.execute_reply.started": "2020-09-22T13:44:17.182825Z"
    }
   },
   "outputs": [],
   "source": [
    "# Input channels for validation, auxiliary (glove pretrained and vocabulary) and training.\n",
    "# The S3 URIs are built with '/' explicitly; os.path.join would use the local OS\n",
    "# path separator, which is not '/' on every platform.\n",
    "s3_input_prefix = 's3://{}/{}'.format(bucket_name, prefix)\n",
    "channels = {\n",
    "    'auxiliary': sagemaker.inputs.TrainingInput(s3_input_prefix + '/auxiliary/',\n",
    "                                                distribution='FullyReplicated',\n",
    "                                                content_type='application/json'),\n",
    "    'train': sagemaker.inputs.TrainingInput(s3_input_prefix + '/train/train.jsonl',\n",
    "                                            distribution='ShardedByS3Key',\n",
    "                                            content_type='application/jsonlines'),\n",
    "    'validation': sagemaker.inputs.TrainingInput(s3_input_prefix + '/val/val.jsonl',\n",
    "                                                 distribution='ShardedByS3Key',\n",
    "                                                 content_type='application/jsonlines'),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Job name is 'default' immediately followed by the current local timestamp.\n",
    "job_name = 'default' + datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\")\n",
    "print(\"Training with job name\", job_name)\n",
    "# Apply the hyperparameter dict defined earlier, then start training on the input channels.\n",
    "o2vec2_class.set_hyperparameters(**hyperparameters)\n",
    "# NOTE(review): wait=True presumably blocks this cell until the job ends - confirm against the SDK docs.\n",
    "o2vec2_class.fit(channels, job_name=job_name, wait=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}