{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.\n",
    "#SPDX-License-Identifier: MIT-0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Install additional libraries\n",
    "#(%pip installs into the environment of the running kernel)\n",
    "%pip install nltk\n",
    "%pip install jsonlines\n",
    "%pip install pandarallel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import libraries and functions\n",
    "import re\n",
    "import pandas as pd\n",
    "import sagemaker\n",
    "from nltk import word_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "from search_utils import helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define common variables\n",
    "\n",
    "#Creating a sagemaker session\n",
    "sagemaker_session = sagemaker.Session()\n",
    "\n",
    "#We'll be using the sagemaker default bucket\n",
    "#Feel free to change this to another bucket name and make sure it's the same across all four notebooks\n",
    "bucket_name = sagemaker_session.default_bucket()\n",
    "\n",
    "#Local directory the raw review files are downloaded to\n",
    "DATA_DIR = \"../data\"\n",
    "\n",
    "#Number of leading rows kept from each category\n",
    "SUBSET_SIZE = 100000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Download data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're using the Amazon reviews dataset (https://s3.amazonaws.com/amazon-reviews-pds/readme.html) which is provided under the following licence https://amazon-reviews-pds.s3.amazonaws.com/LICENSE.txt\n",
    "\n",
    "We load 4 datasets from 4 different categories (Electronics, Shoes, Furniture and Toys). We then take the first 100K reviews from each category and merge the 4 subsets into a single, smaller dataset of 400K reviews containing all categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#-p keeps this cell re-runnable when the directory already exists\n",
    "!mkdir -p {DATA_DIR}/\n",
    "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz {DATA_DIR}/\n",
    "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz {DATA_DIR}/\n",
    "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz {DATA_DIR}/\n",
    "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz {DATA_DIR}/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#-f overwrites a .tsv left over from a previous run instead of failing\n",
    "!gunzip -f {DATA_DIR}/amazon_reviews_us_Electronics_v1_00.tsv.gz\n",
    "!gunzip -f {DATA_DIR}/amazon_reviews_us_Shoes_v1_00.tsv.gz\n",
    "!gunzip -f {DATA_DIR}/amazon_reviews_us_Furniture_v1_00.tsv.gz\n",
    "!gunzip -f {DATA_DIR}/amazon_reviews_us_Toys_v1_00.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_reviews(category):\n",
    "    \"\"\"Load the uncompressed reviews file of one product category.\n",
    "\n",
    "    Malformed lines are skipped silently.\n",
    "\n",
    "    Args:\n",
    "        category (str): category name as used in the file name, e.g. \"Electronics\".\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame: the parsed tab-separated file.\n",
    "    \"\"\"\n",
    "    path = f\"{DATA_DIR}/amazon_reviews_us_{category}_v1_00.tsv\"\n",
    "    try:\n",
    "        #pandas >= 1.3\n",
    "        return pd.read_csv(path, sep=\"\\t\", on_bad_lines=\"skip\")\n",
    "    except TypeError:\n",
    "        #Older pandas does not know on_bad_lines, fall back to the legacy arguments\n",
    "        return pd.read_csv(path, sep=\"\\t\", error_bad_lines=False, warn_bad_lines=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.1 Electronics dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_electronics = load_reviews(\"Electronics\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data_electronics.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_set_electronics = data_electronics[:SUBSET_SIZE]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.2 Shoes dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_shoes = load_reviews(\"Shoes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data_shoes.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_set_shoes = data_shoes[:SUBSET_SIZE]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.3 Furniture dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_furniture = load_reviews(\"Furniture\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data_furniture.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_set_furniture = data_furniture[:SUBSET_SIZE]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.4 Toys dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_toys = load_reviews(\"Toys\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data_toys.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_set_toys = data_toys[:SUBSET_SIZE]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Merge datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = pd.concat([sub_set_electronics, sub_set_shoes, sub_set_furniture, sub_set_toys])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.rename(columns={\"product_id\":\"id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Distribution of categories:\")\n",
    "print(dataset[\"product_category\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name=\"search_knn_blog/data/raw_data/data.csv\", index=False, header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Process data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Changing column names (no-op if already renamed above) and making sure there are no null values in the text\n",
    "dataset = dataset.rename(columns={\"product_id\":\"id\"})\n",
    "#.copy() so the new column added below is written to an independent frame, not a filtered view\n",
    "dataset = dataset[dataset[\"product_title\"].notnull()].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_data(document):\n",
    "    \"\"\"Normalise a product title.\n",
    "\n",
    "    The title is tokenized with NLTK's word_tokenize and lower-cased. Only tokens\n",
    "    of at least three characters made exclusively of the letters a-z are kept.\n",
    "\n",
    "    Args:\n",
    "        document (str): raw product title.\n",
    "\n",
    "    Returns:\n",
    "        str: the kept tokens joined by single spaces (\"\" when no token is kept).\n",
    "    \"\"\"\n",
    "    tokens = (t.lower() for t in word_tokenize(document))\n",
    "\n",
    "    clean_tokens = [t for t in tokens if len(t) >= 3 and re.match(\"^[a-zA-Z]*$\", t)]\n",
    "\n",
    "    return \" \".join(clean_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This will take 1-2mins to run\n",
    "#NOTE(review): word_tokenize needs the NLTK \"punkt\" tokenizer data; if this raises a LookupError,\n",
    "#run nltk.download(\"punkt\") once and re-run the cell - confirm whether the kernel image ships it\n",
    "dataset[\"processed_title\"] = dataset[\"product_title\"].apply(clean_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Making sure there are no null or empty values after processing\n",
    "dataset = dataset[dataset[\"processed_title\"].notnull()]\n",
    "dataset = dataset[dataset[\"processed_title\"]!=\"\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "helpers.write_dataframe_to_s3(dataset, bucket_name=bucket_name, file_name=\"search_knn_blog/data/processed_data/data.csv\", index=False, header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}