### ML model training

Multi-label text classification training using the Amazon SageMaker built-in Object2Vec algorithm.

In [None]:
import os
import boto3
import json
import pickle
import datetime
import pandas as pd
import time
import botocore
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import json_serializer, json_deserializer
from sagemaker.tuner import CategoricalParameter, HyperparameterTuner, IntegerParameter, ContinuousParameter

In [None]:
# Name of the S3 bucket used by this notebook (placeholder value: replace before running).
bucket_name = "YOUR_BUCKET_HERE"
# S3 key prefix under which the Object2Vec input data (train/val/auxiliary/meta) is read.
prefix = "connect/O2VInput"

#### Functions

In [None]:
def download_object(bucket_name, key, local_path):
    """Fetch a single S3 object and store it on the local filesystem.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        key: Key of the object inside the bucket.
        local_path: Local file path the object is written to.

    Returns:
        None. When S3 reports error code "404" a message is printed and
        nothing is raised, so the local file may be absent afterwards.

    Raises:
        botocore.exceptions.ClientError: For any error code other than "404".
    """
    s3_resource = boto3.resource('s3')
    try:
        s3_resource.Bucket(bucket_name).download_file(key, local_path)
    except botocore.exceptions.ClientError as err:
        # Anything other than "not found" is unexpected: propagate it unchanged.
        if err.response['Error']['Code'] != "404":
            raise
        print("The object does not exist")

In [None]:
def create_dir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between check and create.
    os.makedirs(directory, exist_ok=True)

In [None]:
def get_hyperparameter_ranges():
    """Build the search space for the tunable hyperparameters.

    Returns:
        dict: Maps each tunable hyperparameter name to the ``sagemaker.tuner``
        parameter object (integer, continuous or categorical) that describes
        the values to explore.
    """
    return {
        # integer ranges
        'mlp_layers': IntegerParameter(2, 6),
        'early_stopping_patience': IntegerParameter(3, 5),
        # discrete choice
        'mlp_activation': CategoricalParameter(['relu', 'tanh']),
        # continuous ranges
        'dropout': ContinuousParameter(0.4, 0.8),
        'learning_rate': ContinuousParameter(0.0001, 0.001),
        # discrete choice
        'mini_batch_size': CategoricalParameter([512, 1024]),
    }

In [None]:
def update_hyperparameter(hyperparameters, name_hyper, value_hyper):
    """Set one (non-tunable) hyperparameter on an existing mapping.

    The mapping is modified in place and the same object is returned, so the
    call can be used either for its side effect or as an expression.

    Args:
        hyperparameters: Mapping of hyperparameter names to values.
        name_hyper: Name of the hyperparameter to add or overwrite.
        value_hyper: Value to store under ``name_hyper``.

    Returns:
        The ``hyperparameters`` object that was passed in.
    """
    hyperparameters[name_hyper] = value_hyper

    return hyperparameters

##### Specify image URI for Object2Vec

In [None]:
# Create a SageMaker session using the default configuration (no arguments are passed).
sess = sagemaker.Session()

In [None]:
# Resolve the execution role of this notebook; it is passed to the Estimator below.
# (get_execution_role is also imported in the first cell; the re-import is harmless.)
from sagemaker import get_execution_role
role = get_execution_role()


In [None]:
## Get docker image of ObjectToVec algorithm
# image_uris.retrieve is the SageMaker SDK v2 replacement for the deprecated
# get_image_uri helper; the rest of this notebook already uses the v2 API
# (instance_count / TrainingInput). version='1' matches get_image_uri's default.
from sagemaker import image_uris
container = image_uris.retrieve(
    framework='object2vec',
    region=boto3.Session().region_name,
    version='1',
)

In [None]:
# S3 location for model artifacts: same bucket, with the "O2VInput" part of the
# prefix swapped for "O2VOutput". S3 URIs always use "/", so the URI is built
# with plain string formatting instead of os.path.join (which is
# platform-dependent and would drop the bucket if the prefix started with "/").
output_path = 's3://{}/{}'.format(bucket_name, prefix.replace("O2VInput", "O2VOutput"))

In [None]:
# Fetch the pickled vocabulary file from <prefix>/meta/ into the current working directory.
download_object(bucket_name, prefix+"/meta/token_to_vocab_dict.p", "./token_to_vocab_dict.p")

In [None]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if the contents of the bucket are trusted.
# The context manager closes the file handle, which the bare open() call leaked.
with open("token_to_vocab_dict.p", "br") as vocab_file:
    # presumably a token -> vocabulary-id mapping (per the file name); its
    # length is used as the encoder vocabulary size further down.
    tokens = pickle.load(vocab_file)

In [None]:
# Show the number of entries in the loaded vocabulary object (used as enc0_vocab_size below).
len(tokens)

In [None]:
# Echo the S3 output location so it can be checked before training starts.
print("My general output path for the ML model: {}".format(output_path))

##### 3.2 SageMaker Estimator setup

In [None]:
## object2vec estimator
# Generic Estimator wrapping the Object2Vec container image; trains on a single
# ml.m4.xlarge instance and writes model artifacts to output_path.
# NOTE(review): the original comment said "run it within the VPC", but no
# subnets / security_group_ids are passed here - confirm whether VPC
# configuration is required.
o2vec2_class = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_path,
)

In [None]:
## hyperparameter specification
# Fixed (non-tuned) Object2Vec hyperparameters handed to the estimator before
# training. Keys are kept in alphabetical order.
hyperparameters = {
    "_kvstore": "device",
    "_num_gpus": "auto",
    "_num_kv_servers": "auto",
    "bucket_width": 0,
    "dropout": 0.4,
    "early_stopping_patience": 3,
    "early_stopping_tolerance": 0.001,
    "enc0_layers": "auto",
    "enc0_max_seq_len": 50,
    "enc0_network": "bilstm",
    "enc0_token_embedding_dim": 300,
    "enc0_vocab_file": "",
    # vocabulary size = number of entries in the loaded token dictionary
    "enc0_vocab_size": len(tokens),
    # presumably makes encoder 1 reuse encoder 0's network - confirm against the Object2Vec docs
    "enc1_network": "enc0",
    "enc_dim": 600,
    "epochs": 100,
    "learning_rate": 0.001,
    "mini_batch_size": 1024,
    "mlp_activation": "relu",
    "mlp_dim": 512,
    "mlp_layers": 2,
    "num_classes": 2,  # labels are either 0 or 1
    "optimizer": "adam",
    "output_layer": "softmax",
    "weight_decay": 0,
}

In [None]:
# Input channels for training, validation and auxiliary data (GloVe pretrained
# embeddings and vocabulary).
# S3 URIs always use "/" as separator, so they are built with string formatting
# rather than os.path.join, whose result depends on the local platform.
input_root_uri = 's3://{}/{}'.format(bucket_name, prefix)
channels = {
    # Auxiliary files are copied in full to every training instance.
    'auxiliary': sagemaker.inputs.TrainingInput(
        input_root_uri + '/auxiliary/',
        distribution='FullyReplicated',
        content_type='application/json',
    ),
    'train': sagemaker.inputs.TrainingInput(
        input_root_uri + '/train/train.jsonl',
        distribution='ShardedByS3Key',
        content_type='application/jsonlines',
    ),
    'validation': sagemaker.inputs.TrainingInput(
        input_root_uri + '/val/val.jsonl',
        distribution='ShardedByS3Key',
        content_type='application/jsonlines',
    ),
}

##### Training

In [None]:
# Apply the fixed hyperparameters defined above to the estimator.
o2vec2_class.set_hyperparameters(**hyperparameters)

# Job name: the literal "default" immediately followed by the current local
# timestamp, so repeated runs get distinct names.
job_name = 'default{}'.format(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
print("Training with job name", job_name)

# Launch the training job; wait=True keeps the cell running until the job ends.
o2vec2_class.fit(channels, job_name=job_name, wait=True)