In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
#install additional libraries
!pip install nltk
!pip install jsonlines
!pip install pandarallel

In [None]:
#import libraries
import os
import uuid
import datetime
import time
import logging
import glob

import boto3
import sagemaker

from search_utils import helpers

In [None]:
#Define common variables

#Create the SageMaker session; it is used below to look up the default bucket
sagemaker_session = sagemaker.Session()

#We'll be using the SageMaker default bucket.
#Feel free to change this to another bucket name, but make sure it's the same across all four notebooks.
bucket_name = sagemaker_session.default_bucket()

In [None]:
def generate_unique_id():
    """Return a newly generated random (version 4) UUID in its string form."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)

# 1. Building the docker image 

First we'll build a custom docker container in order to use it with the SageMaker processing jobs.

Inside the Docker image, we'll install the libraries listed in the requirements.txt file.

We'll also upload the source code (helper functions, processing functions, etc.) under "/opt/source_code/" so that it is accessible at runtime.

In [None]:
%%bash
# Stop at the first failing command so the build script is never run from the wrong directory
set -e
cd ../
sh build_and_push.sh

In [None]:
#Make sure you replace the following variable with your account id and region
#You can also copy and paste the ECR URI from the logs of the previous cell
#NOTE(review): as written, the value below has no account id or region in it, so it must be edited
#before running the processing jobs. Presumably the expected form is
#<account-id>.dkr.ecr.<region>.amazonaws.com/sm-search:latest - confirm against the build_and_push.sh logs.
ecr_uri = "-dkr.ecr-.amazonaws.com/sm-search:latest"

In [None]:
#uploading the search_utils files so they are accessible during runtime
s3_client = boto3.client("s3")
for file_name in glob.glob("../src/search_utils/*.py"):
    # os.path.basename extracts the file name regardless of the platform's path separator,
    # so every module lands directly under the search_knn_blog/code/ prefix.
    s3_client.upload_file(file_name, bucket_name, f"search_knn_blog/code/{os.path.basename(file_name)}")

# 2. Preprocessing 

In [None]:
from sagemaker.processing import ScriptProcessor
# Processor that runs python3 scripts inside the custom image referenced by ecr_uri:
# a single ml.m5.4xlarge instance with a 50 GB volume, using the notebook's execution role.
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=ecr_uri,
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.4xlarge',
    instance_count=1,
    volume_size_in_gb=50,
)


In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Build a run id from the current UTC timestamp (yymmddHHMMSS + microseconds, minus the last
# two digits). datetime.utcnow() is deprecated, so an aware UTC datetime is used instead;
# the formatted string is the same.
now = datetime.datetime.now(datetime.timezone.utc)
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")
preprocess_job_name = f"search-preprocess-{run_id}"

s3_input_path=f"s3://{bucket_name}/search_knn_blog/data/processed_data/"
s3_code_path=f"s3://{bucket_name}/search_knn_blog/code/"

# Every output of the job is uploaded to the same S3 prefix, named after the job.
preprocess_output_uri = f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/"

# One ProcessingOutput per name; each is read from /opt/ml/processing/<name> in the container.
preprocess_output_names = [
    'train_textual',
    'test_textual',
    'train_numerical',
    'test_numerical',
    'vocab',
    'raw_vocab',
]

script_processor.run(
    job_name=preprocess_job_name,
    code='../src/preprocessing_main.py',
    inputs=[
        # Input data for the preprocessing script.
        ProcessingInput(
            source=s3_input_path,
            destination='/opt/ml/processing/input'),
        # search_utils modules uploaded earlier, placed next to the entry-point script.
        ProcessingInput(
            source=s3_code_path,
            destination='/opt/ml/processing/input/code/search_utils/'),
    ],
    outputs=[
        ProcessingOutput(
            destination=preprocess_output_uri,
            output_name=output_name,
            source=f'/opt/ml/processing/{output_name}')
        for output_name in preprocess_output_names
    ],
    arguments=['--train-test-split-ratio', '0.2', '--total-nb-of-records', '10000'],
    # Do not block here; the job status is polled in the next cell.
    wait=False,
)



In [None]:
# Poll the preprocessing job every 30 seconds until it reaches a terminal state.
# 'Stopping' is still a transitional state, so polling continues through it.
sagemaker_client = boto3.client("sagemaker")

while True:
    job_description = sagemaker_client.describe_processing_job(ProcessingJobName=preprocess_job_name)
    status = job_description["ProcessingJobStatus"]
    print(status)
    if status not in ('InProgress', 'Stopping'):
        break
    time.sleep(30)

# The following cells consume this job's outputs, so fail loudly if it did not complete.
if status != 'Completed':
    raise RuntimeError(
        f"Processing job {preprocess_job_name} ended with status {status}: "
        f"{job_description.get('FailureReason', 'no failure reason reported')}"
    )

# 3. Glove embedding

We will use GloVe embeddings to initialize the embedding values of the word tokens. The GloVe embeddings can be downloaded from: https://nlp.stanford.edu/projects/glove/

This data is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/.


Let's start by pulling the glove embeddings locally then pushing them to S3 using the following commands:

In [None]:
%%bash
# This might take a few minutes
# Stop at the first failing command instead of continuing with a missing or partial download
set -euo pipefail
# -p: do not fail when the directory already exists (e.g. when the cell is re-run)
mkdir -p /tmp/GloVe
# -f: fail on HTTP errors rather than saving the error page as the zip file
curl -fLo /tmp/GloVe/glove.840B.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
# -o: overwrite previously extracted files without prompting
unzip -o /tmp/GloVe/glove.840B.zip -d /tmp/GloVe/
rm /tmp/GloVe/glove.840B.zip

In [None]:
# Push the extracted GloVe text file to the artefacts prefix of the bucket.
boto3.client("s3").upload_file(
    Filename="/tmp/GloVe/glove.840B.300d.txt",
    Bucket=bucket_name,
    Key="search_knn_blog/artefacts/glove.840B.300d.txt",
)

We can now go ahead and create a processing job that will parse the vocabulary generated in the previous section and output a trimmed version of the GloVe embeddings based on our vocabulary.


In [None]:
from sagemaker.processing import ScriptProcessor
# Processor for the GloVe trimming job: same custom image and python3 entry point,
# on a single ml.m5.xlarge instance. This rebinds script_processor.
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=ecr_uri,
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.xlarge',
    instance_count=1,
)


In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Build a run id from the current UTC timestamp (yymmddHHMMSS + microseconds, minus the last
# two digits). datetime.utcnow() is deprecated, so an aware UTC datetime is used instead;
# the formatted string is the same.
now = datetime.datetime.now(datetime.timezone.utc)
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")
glove_job_name = f"search-glove-{run_id}"

s3_code_path=f"s3://{bucket_name}/search_knn_blog/code/"

# Both outputs of the job are uploaded to the same S3 prefix, named after the job.
glove_output_uri = f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{glove_job_name}/"

script_processor.run(
    job_name=glove_job_name,
    code='../src/glove_embeddings_main.py',
    inputs=[
        # search_utils modules uploaded earlier, placed next to the entry-point script.
        ProcessingInput(
            source=s3_code_path,
            destination='/opt/ml/processing/input/code/search_utils/'),
        # Everything the preprocessing job wrote under its run prefix.
        ProcessingInput(
            source=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
            destination='/opt/ml/processing/input_vocabulary'),
        # The full GloVe embeddings file uploaded in the previous cell.
        ProcessingInput(
            source=f"s3://{bucket_name}/search_knn_blog/artefacts/glove.840B.300d.txt",
            destination='/opt/ml/processing/input_glove'),
    ],
    outputs=[
        ProcessingOutput(
            destination=glove_output_uri,
            output_name='trimmed_glove',
            source='/opt/ml/processing/trimmed_glove'),
        ProcessingOutput(
            destination=glove_output_uri,
            output_name='vocab',
            source='/opt/ml/processing/vocab'),
    ],
    arguments=['--train-test-split-ratio', '0.2'],
    # Do not block here; the job status is polled in the next cell.
    wait=False,
)

In [None]:
# Poll the GloVe processing job every 30 seconds until it reaches a terminal state.
# 'Stopping' is still a transitional state, so polling continues through it.
sagemaker_client = boto3.client("sagemaker")

while True:
    job_description = sagemaker_client.describe_processing_job(ProcessingJobName=glove_job_name)
    status = job_description["ProcessingJobStatus"]
    print(status)
    if status not in ('InProgress', 'Stopping'):
        break
    time.sleep(30)

# Training reads this job's outputs, so fail loudly if it did not complete.
if status != 'Completed':
    raise RuntimeError(
        f"Processing job {glove_job_name} ended with status {status}: "
        f"{job_description.get('FailureReason', 'no failure reason reported')}"
    )

In [None]:
# Show the GloVe processing job name so it can be noted down for the inference step.
print(f"This is the processing job name you will need during inference : {glove_job_name}")

# 4. Training 

In [None]:
# Load vocab.json from the GloVe job's run prefix. Its length is used below as the vocabulary size.
# Presumably a token -> id mapping, judging by the variable name - confirm against helpers.read_json_from_s3.
word_to_id = helpers.read_json_from_s3(bucket_name, f"search_knn_blog/sagemaker-runs/{glove_job_name}/vocab.json")

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input

# Build a run id from the current UTC timestamp (yymmddHHMMSS + microseconds, minus the last
# two digits). datetime.utcnow() is deprecated, so an aware UTC datetime is used instead;
# the formatted string is the same.
now = datetime.datetime.now(datetime.timezone.utc)
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")

training_job_name = f"search-training-{run_id}"
# S3 URIs always use '/', so the output location is built with an f-string rather than os.path.join.
output_path = f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{training_job_name}"

# Estimator based on the 'object2vec' image for the current region.
# NOTE(review): get_image_uri and the train_instance_* arguments are SageMaker Python SDK v1-style
# names - confirm the installed SDK version still accepts them.
regressor = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, 'object2vec'),
    sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.p3.8xlarge',
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
)


# Hyperparameters for the 'object2vec' estimator; they are applied to it with
# regressor.set_hyperparameters(**hyperparameters) in a later cell.
hyperparameters = {
 "enc_dim": 512, #The dimension of the output of the embedding layer.
 "mlp_dim": 256, #The dimension of the output from MLP layers.
 "mlp_activation": "linear",
 "mlp_layers": 2,
 
 "output_layer" : "softmax",#classification task
 "num_classes": 2,#two classes: 0 and 1

 "optimizer" : "adam",
 "learning_rate" : 0.0004,
 "mini_batch_size": 256,
 "epochs" : 20,

 "enc0_max_seq_len": 200,
 "enc1_max_seq_len": 200,

 "enc0_network": "bilstm", #The network model for the enc0 encoder.
 "enc1_network": "enc0", #same as enc0_network

 "enc0_token_embedding_dim": 300, #The output dimension of the enc0 token embedding layer.
 "enc1_token_embedding_dim": 300, #The output dimension of the enc1 token embedding layer.
 
 "enc0_vocab_file" : "vocab.json", #The vocabulary file for mapping pretrained enc0 token embedding vectors to numerical vocabulary IDs.
 "enc1_vocab_file" : "vocab.json", #same as enc0_vocab_file

 "enc0_vocab_size" : len(word_to_id),#The vocabulary size of enc0 tokens (size of the vocabulary loaded into word_to_id).
 "enc1_vocab_size" : len(word_to_id),#The vocabulary size of enc1 tokens (same vocabulary as enc0).
 
 #Pretrained embedding files for both encoders; presumably read from the 'auxiliary' input channel - confirm.
 "enc0_pretrained_embedding_file" : "trimmed_glove.txt",
 "enc1_pretrained_embedding_file" : "trimmed_glove.txt"
 
}

input_channels = {}
s3_client = boto3.client('s3')

# S3 URIs always use '/', so they are built with f-strings rather than os.path.join.
runs_prefix = f"s3://{bucket_name}/search_knn_blog/sagemaker-runs"

# Train and test channels: JSON Lines files under the preprocessing job's run prefix.
input_channels["train"] = s3_input(
    f"{runs_prefix}/{preprocess_job_name}/numerical_train_data.jsonl",
    distribution='FullyReplicated',
    content_type='application/jsonlines')

input_channels["test"] = s3_input(
    f"{runs_prefix}/{preprocess_job_name}/numerical_test_data.jsonl",
    distribution='FullyReplicated',
    content_type='application/jsonlines')

# Auxiliary channel: the whole run prefix of the GloVe processing job.
input_channels['auxiliary'] = s3_input(
    f"{runs_prefix}/{glove_job_name}",
    distribution='FullyReplicated',
    content_type='application/json')


In [None]:
# Apply the hyperparameters dictionary to the estimator.
regressor.set_hyperparameters(**hyperparameters)
# Start the training job under the explicit job name. wait=False: do not block on completion here;
# the job status is polled in the following cell.
regressor.fit(input_channels, job_name=training_job_name, wait=False)

In [None]:
# Poll the training job every 30 seconds until it reaches a terminal state.
# 'Stopping' is still a transitional state, so polling continues through it.
sagemaker_client = boto3.client("sagemaker")

while True:
    job_description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
    status = job_description["TrainingJobStatus"]
    print(status)
    if status not in ('InProgress', 'Stopping'):
        break
    time.sleep(30)

# The metrics cell below only makes sense for a completed job, so fail loudly otherwise.
if status != 'Completed':
    raise RuntimeError(
        f"Training job {training_job_name} ended with status {status}: "
        f"{job_description.get('FailureReason', 'no failure reason reported')}"
    )


In [None]:
# Show the training job name so it can be noted down for the inference step.
print(f"This is the training job name you will need during inference : {training_job_name}")

In [None]:
# Fetch the training job description, then keep one {metric name: value} dict per final metric.
training_job_description = boto3.client("sagemaker").describe_training_job(TrainingJobName=training_job_name)
dict_metrics = [
    {final_metric["MetricName"]: final_metric['Value']}
    for final_metric in training_job_description["FinalMetricDataList"]
]

In [None]:
# Display the final metrics collected from the training job (one single-entry dict per metric).
dict_metrics