{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.\n", "#SPDX-License-Identifier: MIT-0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#install additional libraries\n", "!pip install nltk\n", "!pip install jsonlines\n", "!pip install pandarallel\n", "!pip install tensorflow==2.1\n", "!pip install --upgrade grpcio \n", "!pip install --upgrade s3fs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#import libraries\n", "import os \n", "import json\n", "import shutil\n", "import tensorflow as tf\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import boto3\n", "import sagemaker\n", "import nltk\n", "\n", "from search_utils import helpers, search_preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Deploy a SageMaker Endpoint" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Creating a sagemaker session\n", "sagemaker_session = sagemaker.Session()\n", "\n", "#We'll be using the sagemaker default bucket\n", "#Feel free to change this to another bucket name and make sure it's the same across all four notebooks\n", "bucket_name = sagemaker_session.default_bucket()\n", "\n", "#Copy the glove_job_name, this was generated automatically in step 3 of the training notebook\n", "glove_job_name = \"\"\n", "\n", "#Copy the training_job_name, this was generated automatically in step 4 of the training notebook\n", "training_job_name = \"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sagemaker_session.create_model_from_job(training_job_name=training_job_name,\n", " env={'INFERENCE_PREFERRED_MODE': 'embedding'})\n", "\n", "endpoint_config_name = sagemaker_session.create_endpoint_config(name=training_job_name,\n", " model_name=training_job_name,\n", " initial_instance_count=1,\n", " instance_type='ml.m4.xlarge')\n", "\n", "#Specify the name of the endpoint\n", "endpoint_name = \"object2vec-embeddings\"\n", "\n", "sagemaker_session.create_endpoint(endpoint_name=endpoint_name, config_name=training_job_name, tags=None, wait=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. 
Generate predictions using the SageMaker Endpoint" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Read the data from S3 and load the vocabulary\n", "data = pd.read_csv(f\"s3://{bucket_name}/search_knn_blog/data/processed_data/data.csv\")\n", "\n", "word_to_id = helpers.read_json_from_s3(bucket_name,\\\n", "                                       f'search_knn_blog/sagemaker-runs/{glove_job_name}/vocab.json')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Sample a few products from the overall catalog\n", "sub_set = data.sample(10000)\n", "descriptions = sub_set[\"processed_title\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.predictor import json_serializer, json_deserializer\n", "\n", "#Define the encode/decode format for inference data\n", "predictor = sagemaker.predictor.RealTimePredictor(endpoint_name)\n", "predictor.content_type = 'application/json'\n", "predictor.serializer = json_serializer\n", "predictor.deserializer = json_deserializer\n", "tokenizer = nltk.tokenize.TreebankWordTokenizer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def l2_normalize(v):\n", "    \"\"\"\n", "    Normalise an embedding vector to unit length using the L2 norm.\n", "    \"\"\"\n", "    norm = np.sqrt(np.sum(np.square(v)))\n", "\n", "    return v / norm" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#This is the \"enc_dim\" parameter you set in the training job hyperparameters of Object2Vec\n", "#By default this value is set to 512 in the training notebook\n", "embedding_size = 512" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Call the endpoint for every sampled product and collect the normalised embeddings and labels\n", "all_embeddings, labels = [], []\n", "for i, description in enumerate(descriptions):\n", "    if i % 1000 == 0:\n", "        print(f\"Processing product {i}/{len(descriptions)}\")\n", "\n", "    enc_description = search_preprocessing.sentence_to_integers(description, tokenizer, word_to_id)\n", "    if len(enc_description) != 0:\n", "        payload = {\"instances\": [{\"in0\": enc_description}]}\n", "        result = predictor.predict(payload)\n", "        embeddings = result[\"predictions\"][0][\"embeddings\"]\n", "        embeddings = l2_normalize(embeddings)\n", "        all_embeddings.append(embeddings)\n", "    else:\n", "        #Products whose description could not be encoded get a zero vector\n", "        all_embeddings.append([0] * embedding_size)\n", "    labels.append(sub_set.iloc[i][\"product_category\"])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Convert the labels to a string array and the embeddings to a \"float64\" numpy array\n", "labels = np.array(labels, dtype=\"str\")\n", "\n", "embeddings = np.array(all_embeddings).astype(\"float64\")\n", "print(embeddings.shape)" ] },
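{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, you can sanity-check the resulting matrix with the minimal sketch below: every row that was L2-normalised should have a norm close to 1, while rows for products whose description could not be encoded were zero-padded and have a norm of 0." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Optional sanity check: L2-normalised rows should have norm ~1, zero-padded rows have norm 0\n", "norms = np.linalg.norm(embeddings, axis=1)\n", "print(f\"Rows with unit norm: {np.sum(np.isclose(norms, 1.0))}/{len(norms)}\")\n", "print(f\"Zero-padded rows: {np.sum(np.isclose(norms, 0.0))}/{len(norms)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. 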
Visualise the embeddings using the TensorBoard projector" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Create a directory for storing the TensorBoard logs\n", "#If the directory already exists (from previous runs), clean it up first\n", "if os.path.isdir(\"../tensorboard_logs\"):\n", "    shutil.rmtree(\"../tensorboard_logs\")\n", "os.mkdir(\"../tensorboard_logs\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tensorboard.plugins import projector\n", "\n", "def register_embedding(embedding_tensor_name, metadata_path, logs_dir):\n", "    config = projector.ProjectorConfig()\n", "    embedding = config.embeddings.add()\n", "    embedding.tensor_name = embedding_tensor_name\n", "    embedding.metadata_path = metadata_path\n", "    projector.visualize_embeddings(logs_dir, config)\n", "\n", "#Set the TensorBoard logs directory and additional variables\n", "logs_dir = '../tensorboard_logs'\n", "metadata_path = 'metadata.tsv'\n", "embedding_tensor_name = 'embeddings'\n", "EMBEDDINGS_FPATH = os.path.join(logs_dir, f'{embedding_tensor_name}.ckpt')\n", "\n", "#Register and save the embeddings in the logs directory\n", "tf.compat.v1.reset_default_graph()\n", "register_embedding(embedding_tensor_name, metadata_path, logs_dir)\n", "tf.compat.v1.disable_eager_execution()\n", "tensor_embeddings = tf.Variable(embeddings, name=embedding_tensor_name)\n", "tf_session = tf.compat.v1.InteractiveSession()\n", "tf_session.run(tf.compat.v1.global_variables_initializer())\n", "saver = tf.compat.v1.train.Saver()\n", "saver.save(tf_session, EMBEDDINGS_FPATH, 0)\n", "tf_session.close()\n", "\n", "#Save the labels\n", "with open(os.path.join(logs_dir, metadata_path), 'w') as f:\n", "    for label in labels:\n", "        f.write('{}\\n'.format(label))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Open TensorBoard" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Within your terminal, type the following commands:\n", "\n", "$ source activate tensorflow_p36\n", "$ cd \n", "$ tensorboard --logdir ./tensorboard_logs\n", "\n", "Once TensorBoard is running, you can open it in your browser using the following link:\n", "\n", "https://.notebook..sagemaker.aws/proxy//#projector" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Saving enriched data for Elasticsearch" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Store each embedding as a string so it can be serialised to JSON alongside the other columns\n", "str_all_embeddings = [str(list(e)) for e in all_embeddings]\n", "sub_set[\"embeddings\"] = str_all_embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def row2dict(x):\n", "    \"\"\"Convert a dataframe row to a dictionary of strings, dropping missing values.\"\"\"\n", "    x = x.dropna().to_dict()\n", "\n", "    for key in x:\n", "        x[key] = str(x[key])\n", "\n", "    return x\n", "\n", "records_to_save = sub_set.apply(lambda x: row2dict(x), axis=1)\n", "records_to_save = list(records_to_save.values)\n", "\n", "with open(\"./data.json\", \"w\") as write_file:\n", "    json.dump(records_to_save, write_file)\n", "\n", "boto3.client(\"s3\").upload_file(\"./data.json\",\n", "                               bucket_name, \"search_knn_blog/data/enriched_data/data.json\")" ] },
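{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, the cell below is a minimal sketch of how a downstream consumer (for example, the Elasticsearch indexing step) could read data.json back and recover an embedding vector from its string representation with json.loads." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Optional: reload one enriched record and parse its embedding string back into a list of floats\n", "with open(\"./data.json\") as read_file:\n", "    saved_records = json.load(read_file)\n", "\n", "sample_embedding = json.loads(saved_records[0][\"embeddings\"])\n", "print(f\"Loaded {len(saved_records)} records; first embedding has {len(sample_embedding)} dimensions\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. 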
Delete the endpoint (unless you plan to continue to notebook 4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Make sure you delete your endpoint when you're done making predictions\n", "sagemaker_session.delete_endpoint(endpoint_name=\"object2vec-embeddings\")" ] } ], "metadata": { "kernelspec": { "display_name": "conda_tensorflow_p36", "language": "python", "name": "conda_tensorflow_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }