In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
#Install additional libraries
!pip install nltk
!pip install jsonlines
!pip install pandarallel

In [None]:
#Import libraries and functions
import re
import pandas as pd
import sagemaker
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from search_utils import helpers

In [None]:
#Common variables used by the cells below

#Open the SageMaker session used to look up the default bucket
sagemaker_session = sagemaker.Session()

#The data is written to the session's default bucket.
#Another bucket name can be assigned here instead; if so, use the same name in all four notebooks
bucket_name = sagemaker_session.default_bucket()

# 1. Download data

We're using the Amazon reviews dataset (https://s3.amazonaws.com/amazon-reviews-pds/readme.html) which is provided under the following licence https://amazon-reviews-pds.s3.amazonaws.com/LICENSE.txt

We load 4 datasets from 4 different categories (Electronics, Shoes, Furniture and Toys). We then take the first 100k examples from each category and merge the 4 subsets into a single, smaller dataset of up to 400k examples containing all categories.

In [None]:
!mkdir ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz ../data/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz ../data/

In [None]:
!gunzip ../data/amazon_reviews_us_Electronics_v1_00.tsv.gz
!gunzip ../data/amazon_reviews_us_Shoes_v1_00.tsv.gz 
!gunzip ../data/amazon_reviews_us_Furniture_v1_00.tsv.gz
!gunzip ../data/amazon_reviews_us_Toys_v1_00.tsv.gz

## 1.1 Electronics dataset

In [None]:
data_electronics = pd.read_csv("../data/amazon_reviews_us_Electronics_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [None]:
print(data_electronics.shape)

In [None]:
sub_set_electronics = data_electronics[:100000]

## 1.2 Shoes dataset

In [None]:
data_shoes = pd.read_csv("../data/amazon_reviews_us_Shoes_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [None]:
print(data_shoes.shape)

In [None]:
sub_set_shoes = data_shoes[:100000]

## 1.3 Furniture dataset

In [None]:
data_furniture = pd.read_csv("../data/amazon_reviews_us_Furniture_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [None]:
print(data_furniture.shape)

In [None]:
sub_set_furniture = data_furniture[:100000]

## 1.4 Toys dataset

In [None]:
data_toys = pd.read_csv("../data/amazon_reviews_us_Toys_v1_00.tsv", error_bad_lines=False, warn_bad_lines=False, sep="\t")

In [None]:
print(data_toys.shape)

In [None]:
sub_set_toys = data_toys[:100000]

# 2. Merge and process datasets

In [None]:
dataset = pd.concat([sub_set_electronics, sub_set_shoes, sub_set_furniture, sub_set_toys])

In [None]:
dataset=dataset.rename(columns={"product_id":"id"})

In [None]:
dataset.head()

In [None]:
print("Distribution of categories:")
print(dataset["product_category"].value_counts())

In [None]:
helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name="search_knn_blog/data/raw_data/data.csv", index=False, header=True)

# 3. Processing data

In [None]:
#Ensure the identifier column is named "id" and keep only rows that have a product title
dataset = (
    dataset
    .rename(columns={"product_id": "id"})
    .loc[lambda frame: frame["product_title"].notna()]
)

In [None]:
def clean_data(document: str) -> str:
    """Reduce a text to a space-separated string of lower-cased alphabetic tokens.

    The text is tokenized with nltk's ``word_tokenize`` and every token is
    lower-cased. A token is kept only when it is at least 3 characters long
    and consists exclusively of the letters a-z. No lemmatization or stemming
    is applied.

    Args:
        document: the text to clean.

    Returns:
        The kept tokens joined by single spaces, in their original order;
        an empty string when no token qualifies.
    """
    #Built once per call rather than passed as a raw pattern for every token
    letters_only = re.compile("^[a-zA-Z]*$")

    lowered_tokens = (token.lower() for token in word_tokenize(document))

    return " ".join(
        token
        for token in lowered_tokens
        if len(token) >= 3 and letters_only.match(token)
    )

In [None]:
#Clean every product title; expect this cell to run for 1-2mins
dataset["processed_title"] = dataset["product_title"].apply(clean_data)

In [None]:
#Keep only the rows whose processed title is neither null nor an empty string
dataset = dataset[
    dataset["processed_title"].notna() & dataset["processed_title"].ne("")
]

In [None]:
dataset.head()

In [None]:
helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name="search_knn_blog/data/processed_data/data.csv", index=False, header=True)