{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Take a look at the data files you have downloaded." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare your data \n", "[Back to top](#top)\n", "\n", "The next thing to be done is to load the data and confirm the data is in a good state, then save it to a CSV where it is ready to be used with Amazon Personalize.\n", "\n", "To get started, import a collection of Python libraries commonly used in data science." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "from time import sleep\n", "import json\n", "from datetime import datetime\n", "import numpy as np\n", "import boto3\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Configure the SDK to Personalize:\n", "personalize = boto3.client('personalize')\n", "personalize_runtime = boto3.client('personalize-runtime')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "item_df = pd.read_pickle(\"item_df.p\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "item_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "user_df = pd.read_pickle(\"user_df.p\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "train_data = pd.read_csv(\"ml-100k/train.csv\")\n", "test_data = pd.read_csv(\"ml-100k/test.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next,open the data file and take a look at the first several rows." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Offline Evaluation " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install tqdm\n", "from tqdm import tqdm_notebook\n", "from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_number = 1000\n", "unique_user = test_data['uid'].unique()\n", "np.random.shuffle(unique_user)\n", "sampled_user = unique_user[:sample_number]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "sampled_results = test_data[test_data['uid'].isin(sampled_user)].groupby('uid').iid\n", "sampled_results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "\n", "rerank_campaign_arn = \n", "\n", "relevance = []\n", "for user_id, true_items in tqdm_notebook(sampled_results):\n", " rec_response = personalize_runtime.get_recommendations(\n", " campaignArn = rerank_campaign_arn,\n", " userId = str(user_id)\n", " )\n", " rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n", " relevance.append([int(x in true_items.values) for x in rec_items])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))\n", "print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))\n", "print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))\n", "print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))\n", "print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))\n", "print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))\n", "print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate diversity, novelty and serendipity" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \\\n", " 'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \\\n", " 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n", "userprofile = train_data.merge(item_df, on=['iid'])[['uid']+genres].groupby(['uid']).sum().reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "userprofile" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## item object \n", "\n", "class Item: \n", " def __init__(self, item_df, play_log_df):\n", " self.items = item_df\n", " self.play_log = play_log_df.groupby(['iid']).sum().reset_index()\n", " \n", " def get_contents_by_id(self, id): \n", " return self.items[self.items['iid']==id].values[0][5:] #categories \n", " \n", " \n", " def get_popularity_by_id(self, id):\n", " if len(self.play_log[self.play_log['iid']==id].values) == 0:\n", " return 0\n", " return self.play_log[self.play_log['iid']==id].values[0][3]\n", "\n", "class User: \n", " def __init__(self, train_data, item_df):\n", " genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \\\n", " 'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \\\n", " 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n", " self.userprofile = train_data.merge(item_df, on=['iid'])[['uid']+genres].groupby(['uid']).sum().reset_index()\n", " \n", " def get_user_profile(self, id):\n", "\n", " if len(self.userprofile[self.userprofile['uid']==id].values)==0: \n", " return [0 for i in range(0, 19)]\n", " else:\n", " raw_profile = self.userprofile[self.userprofile['uid']==id].values[0][1:]\n", " avg = np.average(raw_profile)\n", " return [1 if x >=avg else 0 for x in raw_profile]\n", " \n", " \n", " \n", "item_db = Item(item_df, train_data)\n", "user_db = User(train_data, item_df)\n", "print(item_db.get_contents_by_id(1)) \n", "print(item_db.get_popularity_by_id(1)) \n", "print(user_db.get_user_profile(7))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### done by inter-similarity of a recommendation list \n", "import math\n", "\n", "def diversity(pred, item_db):\n", " d = 0 \n", " for i, p1 in enumerate(pred): \n", " for j, p2 in enumerate(pred):\n", " if j > i: \n", " dist = sum(abs(item_db.get_contents_by_id(p1) - item_db.get_contents_by_id(p2))) \n", " d += dist\n", " return d \n", "\n", "def novelty(pred, item_db):\n", " d = 0 \n", " for i, p in enumerate(pred):\n", " d += 1/(math.log(item_db.get_popularity_by_id(p)+2,2)+1)\n", " return d\n", "\n", "\n", "def serendipity(pred, groud_truth, uid, user_db, item_db): \n", " up = user_db.get_user_profile(uid)\n", " up_norm = [1 if i > 0 else 0 for i in up ]\n", " dist_total = 0 \n", " for p in pred:\n", " if p in groud_truth:\n", " contents = item_db.get_contents_by_id(p)\n", " dist = sum(abs(up_norm - contents)) \n", " dist_total += dist\n", " return dist_total / len(pred)\n", " \n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "total_diversity = 0 \n", "total_novelty = 0 \n", "total_serendipity = 0 \n", "\n", "\n", "for user_id, true_items in tqdm_notebook(sampled_results):\n", " rec_response = personalize_runtime.get_recommendations(\n", " campaignArn = rerank_campaign_arn,\n", " userId = str(user_id)\n", " )\n", " rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n", " total_diversity += diversity(rec_items, item_db)\n", " total_novelty += novelty(rec_items, item_db)\n", " total_serendipity += serendipity(rec_items, true_items, user_id, user_db, item_db)\n", " \n", "users = test_data['uid'].unique() \n", "print(total_diversity / sample_number) \n", "print(total_novelty / sample_number)\n", "print(total_serendipity / sample_number)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%store dataset_group_arn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%store schema_arn \n", "%store item_schema_arn\n", "%store user_schema_arn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%store role_arn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_group_arn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }