In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
#install additional libraries
!pip install nltk
!pip install jsonlines
!pip install pandarallel
!pip install tensorflow==2.1
!pip install --upgrade grpcio 
!pip install --upgrade s3fs

In [None]:
#import libraries
import os 
import json
import shutil
import tensorflow as tf
import pandas as pd
import numpy as np

import boto3
import sagemaker
import nltk

from search_utils import helpers, search_preprocessing

# 1. Deploy a SageMaker Endpoint

In [None]:
#Create the SageMaker session used for every SageMaker call in this notebook
sagemaker_session = sagemaker.Session()

#We'll be using the SageMaker default bucket
#Feel free to change this to another bucket name, but make sure it's the same across all four notebooks
bucket_name = sagemaker_session.default_bucket()

#Paste the glove_job_name here; it was generated automatically in step 3 of the training notebook
#It is used below to locate the vocabulary file (vocab.json) in S3
glove_job_name = ""

#Paste the training_job_name here; it was generated automatically in step 4 of the training notebook
#It is used below to create the model and the endpoint configuration
training_job_name = ""

In [None]:
#Register a SageMaker model from the finished training job.
#INFERENCE_PREFERRED_MODE=embedding is set so that predictions carry an "embeddings" field,
#which is what the prediction loop further down reads from each response.
sagemaker_session.create_model_from_job(
    training_job_name=training_job_name,
    env={'INFERENCE_PREFERRED_MODE': 'embedding'},
)

#Describe the hosting fleet: one ml.m4.xlarge instance serving the model created above.
#The model (and the endpoint configuration) are named after the training job.
endpoint_config_name = sagemaker_session.create_endpoint_config(
    name=training_job_name,
    model_name=training_job_name,
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

#Specify the name of the endpoint
endpoint_name = "object2vec-embeddings"

#Create the endpoint from the configuration that was just created and block until it is ready (wait=True).
#The name returned by create_endpoint_config is passed on directly instead of being re-typed.
sagemaker_session.create_endpoint(
    endpoint_name=endpoint_name,
    config_name=endpoint_config_name,
    tags=None,
    wait=True,
)

# 2. Generate predictions using the SageMaker Endpoint

In [None]:
#Load the processed product data from S3 into a dataframe
data = pd.read_csv(f"s3://{bucket_name}/search_knn_blog/data/processed_data/data.csv")

#Load the vocabulary (vocab.json) stored under the GloVe job's run prefix in the same bucket
word_to_id = helpers.read_json_from_s3(
    bucket_name,
    f'search_knn_blog/sagemaker-runs/{glove_job_name}/vocab.json',
)

In [None]:
#Sample a few products from the overall catalog
#The sample size is capped at the catalog size so that catalogs with fewer than 10000 rows do not raise,
#and the seed is fixed so that re-running the notebook selects the same products
sub_set = data.sample(n=min(10000, len(data)), random_state=42)
descriptions = sub_set["processed_title"]

In [None]:
from sagemaker.predictor import json_serializer, json_deserializer

# define encode-decode format for inference data
# The predictor wraps the endpoint named by `endpoint_name`; requests are sent as JSON
# and responses are decoded from JSON.
# NOTE(review): RealTimePredictor / json_serializer / json_deserializer look like
# SageMaker Python SDK v1 names - confirm the installed SDK version still provides them.
predictor = sagemaker.predictor.RealTimePredictor(endpoint_name)
predictor.content_type = 'application/json'
predictor.serializer = json_serializer
predictor.deserializer = json_deserializer
# Tokenizer handed to search_preprocessing.sentence_to_integers in the prediction loop
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [None]:
def l2_normalize(v):
    """
    Scale a vector so that its L2 (Euclidean) norm equals 1.

    Parameters
    ----------
    v : array-like of numbers
        The vector to normalise (a list or a numpy array).

    Returns
    -------
    numpy.ndarray
        A float64 array with the same shape as ``v``. If the norm of ``v`` is zero
        (all-zero or empty input) the values are returned unscaled rather than
        being divided by zero, which would produce NaNs.
    """
    v = np.asarray(v, dtype=np.float64)
    norm = np.linalg.norm(v)

    # A zero vector has no direction; dividing by its norm would yield NaNs
    if norm == 0:
        return v

    return v / norm

In [None]:
#This is the "enc_dim" parameter you have set in the training job hyperparameters of object2vec
#By default this value is set to 512 in the training notebook
#It is used below as the length of the all-zero placeholder for products that could not be encoded
embedding_size=512

In [None]:
#Collect one embedding and one label per sampled product, in the same order
all_embeddings, labels = [], []

#Read the categories once, in the same row order as the descriptions,
#instead of building a full row with sub_set.iloc[i] on every iteration
categories = sub_set["product_category"]

for i, (description, category) in enumerate(zip(descriptions, categories)):
    if i%1000==0:
        print(f"Processing product {i}/{len(descriptions)}")

    #Every product keeps its label, whether or not its description could be encoded
    labels.append(category)

    enc_description = search_preprocessing.sentence_to_integers(description, tokenizer, word_to_id)
    if len(enc_description) == 0:
        #Nothing to send to the endpoint: store an all-zero placeholder so that
        #all_embeddings stays aligned with labels and with the rows of sub_set
        all_embeddings.append([0]*embedding_size)
        continue

    #One request per product; the endpoint returns the embedding under predictions[0]["embeddings"]
    payload = {"instances" : [{"in0": enc_description}]}
    result = predictor.predict(payload)
    all_embeddings.append(l2_normalize(result["predictions"][0]["embeddings"]))


In [None]:
#Convert the labels to a numpy array of strings
labels = np.array(labels, dtype="str")

#Stack the per-product embeddings into a single float64 matrix.
#Passing dtype to np.array converts the values; assigning to `.dtype` afterwards
#would only reinterpret the underlying bytes without converting them.
embeddings = np.array(all_embeddings, dtype="float64")
print(embeddings.shape)

# 3. Visualise the embeddings using Tensorboard projector

In [None]:
#Start from an empty tensorboard logs directory:
#remove whatever a previous run left behind, then create the directory afresh
if os.path.isdir("../tensorboard_logs"):
    shutil.rmtree("../tensorboard_logs")

os.mkdir("../tensorboard_logs")

In [None]:
from tensorboard.plugins import projector

def register_embedding(embedding_tensor_name, metadata_path, logs_dir):
    """
    Register an embedding tensor with the TensorBoard projector.

    Builds a projector configuration containing a single embedding entry and
    passes it to ``projector.visualize_embeddings`` for ``logs_dir``.

    Parameters
    ----------
    embedding_tensor_name : str
        Value stored as the entry's ``tensor_name``.
    metadata_path : str
        Value stored as the entry's ``metadata_path`` (the labels file).
    logs_dir : str
        Logs directory handed to ``projector.visualize_embeddings``.
    """
    projector_config = projector.ProjectorConfig()

    embedding_entry = projector_config.embeddings.add()
    embedding_entry.tensor_name = embedding_tensor_name
    embedding_entry.metadata_path = metadata_path

    projector.visualize_embeddings(logs_dir, projector_config)
 
#Setting the tensorboard logs directory and additional variables
logs_dir = '../tensorboard_logs'
metadata_path = 'metadata.tsv'
embedding_tensor_name = 'embeddings'
EMBEDDINGS_FPATH = os.path.join(logs_dir, f'{embedding_tensor_name}.ckpt')

#Registering and saving the embeddings in the logs directory
#Start from a clean default graph so that re-running this cell does not accumulate variables
tf.compat.v1.reset_default_graph()
register_embedding(embedding_tensor_name, metadata_path, logs_dir)

#The Saver/Session API below is the TF1-style graph API, so eager execution is switched off
tf.compat.v1.disable_eager_execution()
tensor_embeddings = tf.Variable(embeddings, name=embedding_tensor_name)

tf_session = tf.compat.v1.InteractiveSession()
try:
    tf_session.run(tf.compat.v1.global_variables_initializer())
    saver = tf.compat.v1.train.Saver()
    #Write the checkpoint to the path computed above, with global step 0
    saver.save(tf_session, EMBEDDINGS_FPATH, global_step=0)
finally:
    #Release the session even if initialisation or saving fails
    tf_session.close()

#Saving the labels, one per line, in the same order as the rows of the embeddings matrix
with open(os.path.join(logs_dir, metadata_path), 'w') as f:
    f.writelines(f'{label}\n' for label in labels)
 


# 4. Open Tensorboard

# 5. Saving enriched data for Elasticsearch 

In [None]:
#Serialise every embedding as the text form of a plain Python list.
#np.asarray(...).tolist() yields native Python numbers, so the text does not depend on
#how the installed NumPy version prints its own scalar types inside a list.
str_all_embeddings = [str(np.asarray(e).tolist()) for e in all_embeddings]
sub_set["embeddings"]= str_all_embeddings

In [None]:
def row2dict(x):
    """
    Convert a dataframe row into a dictionary of strings.

    Missing values are dropped from the row first; every remaining value
    is converted with ``str``.

    Parameters
    ----------
    x : pandas.Series
        One row of a dataframe.

    Returns
    -------
    dict
        Mapping of each non-missing field name to the string form of its value.
    """
    return {key: str(value) for key, value in x.dropna().to_dict().items()}

#Turn every row of the sample into a dictionary of strings and collect them in a list
records_to_save = sub_set.apply(row2dict, axis=1).tolist()

#Write the records to a local JSON file
with open("./data.json", "w") as write_file:
    json.dump(records_to_save, write_file)

#Upload the JSON file to the enriched_data prefix of the bucket
boto3.client("s3").upload_file(
    "./data.json",
    bucket_name,
    "search_knn_blog/data/enriched_data/data.json",
)

# 6. Delete the endpoint (unless you plan to continue to notebook 4)

In [None]:
#Make sure you delete your endpoint when you're done making predictions
#Use the endpoint_name variable set when the endpoint was created, so that the endpoint
#deleted here is the one that was deployed even if its name was changed above
sagemaker_session.delete_endpoint(endpoint_name=endpoint_name)