{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Take a look at the data files you have downloaded."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare your data \n",
"[Back to top](#top)\n",
"\n",
"The next thing to be done is to load the data and confirm the data is in a good state, then save it to a CSV where it is ready to be used with Amazon Personalize.\n",
"\n",
"To get started, import a collection of Python libraries commonly used in data science."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from time import sleep\n",
"import json\n",
"from datetime import datetime\n",
"import numpy as np\n",
"import boto3\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Configure the SDK to Personalize:\n",
"personalize = boto3.client('personalize')\n",
"personalize_runtime = boto3.client('personalize-runtime')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"item_df = pd.read_pickle(\"item_df.p\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"item_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"user_df = pd.read_pickle(\"user_df.p\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"train_data = pd.read_csv(\"ml-100k/train.csv\")\n",
"test_data = pd.read_csv(\"ml-100k/test.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, open the data file and take a look at the first several rows."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Offline Evaluation "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install tqdm\n",
"from tqdm import tqdm_notebook\n",
"from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Shuffle with a fixed seed so the same users are evaluated on every run.\n",
"unique_user = test_data['uid'].unique()\n",
"np.random.RandomState(42).shuffle(unique_user)\n",
"\n",
"# Cap the sample at the number of distinct users in the test set, so averages that\n",
"# are later divided by sample_number are not diluted when fewer users exist.\n",
"sample_number = min(1000, len(unique_user))\n",
"sampled_user = unique_user[:sample_number]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# Held-out item ids of the sampled users, grouped per user.\n",
"# Iterating this object yields (uid, Series of iid) pairs.\n",
"sampled_results = (\n",
"    test_data[test_data['uid'].isin(sampled_user)]\n",
"    .groupby('uid')['iid']\n",
")\n",
"sampled_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"\n",
"# TODO: paste the ARN of the Amazon Personalize campaign to evaluate.\n",
"rerank_campaign_arn = ''\n",
"if not rerank_campaign_arn:\n",
"    raise ValueError('Set rerank_campaign_arn to your campaign ARN before running this cell.')\n",
"\n",
"# For every sampled user, request recommendations and mark each recommended item\n",
"# with 1 if it is among that user's held-out test items, else 0 (rank order kept).\n",
"relevance = []\n",
"for user_id, true_items in tqdm_notebook(sampled_results):\n",
"    rec_response = personalize_runtime.get_recommendations(\n",
"        campaignArn=rerank_campaign_arn,\n",
"        userId=str(user_id)\n",
"    )\n",
"    rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n",
"    # Build the set once per user instead of scanning the array for every item.\n",
"    held_out = set(true_items.values)\n",
"    relevance.append([int(x in held_out) for x in rec_items])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Average each ranking metric over all evaluated users.\n",
"print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))\n",
"for k in (5, 10, 25):\n",
"    print('precision_at_%d' % k, np.mean([precision_at_k(r, k) for r in relevance]))\n",
"for k in (5, 10, 25):\n",
"    print('normalized_discounted_cumulative_gain_at_%d' % k, np.mean([ndcg_at_k(r, k) for r in relevance]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate diversity, novelty and serendipity"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Genre columns selected from the joined interaction/item frame, in profile order.\n",
"genres = [\n",
"    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',\n",
"    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',\n",
"    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',\n",
"]\n",
"\n",
"# Join interactions to item rows on iid, keep uid plus the genre columns, sum per user.\n",
"userprofile = (\n",
"    train_data.merge(item_df, on='iid')[['uid'] + genres]\n",
"    .groupby('uid')\n",
"    .sum()\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"userprofile"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## item object \n",
"\n",
"class Item:\n",
"    \"\"\"Lookups over the item table and the interaction log summed per item.\n",
"\n",
"    Results are memoised per item id because the evaluation loops request the\n",
"    same ids repeatedly and every uncached lookup filters a whole DataFrame.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, item_df, play_log_df):\n",
"        \"\"\"Keep the item table and build the per-item aggregated log.\n",
"\n",
"        item_df: DataFrame with an 'iid' column.\n",
"        play_log_df: DataFrame with an 'iid' column; its other columns are\n",
"            summed per item.\n",
"        \"\"\"\n",
"        self.items = item_df\n",
"        self.play_log = play_log_df.groupby(['iid']).sum().reset_index()\n",
"        # Per-id caches; they assume self.items / self.play_log are not modified later.\n",
"        self._contents_cache = {}\n",
"        self._popularity_cache = {}\n",
"\n",
"    def get_contents_by_id(self, id):\n",
"        \"\"\"Return the first row whose 'iid' equals id, from column 5 onward (categories).\n",
"\n",
"        Raises IndexError when no row matches. The returned array is shared\n",
"        with the cache, so callers should treat it as read-only.\n",
"        \"\"\"\n",
"        if id not in self._contents_cache:\n",
"            matches = self.items[self.items['iid'] == id].values\n",
"            # NOTE(review): offset 5 depends on the column order of item_df - confirm.\n",
"            self._contents_cache[id] = matches[0][5:]\n",
"        return self._contents_cache[id]\n",
"\n",
"    def get_popularity_by_id(self, id):\n",
"        \"\"\"Return column 3 of the aggregated log row for id, or 0 if the item has no log rows.\"\"\"\n",
"        if id not in self._popularity_cache:\n",
"            matches = self.play_log[self.play_log['iid'] == id].values\n",
"            # NOTE(review): position 3 is whichever summed column lands there - confirm\n",
"            # it is the intended popularity signal for this log schema.\n",
"            self._popularity_cache[id] = matches[0][3] if len(matches) else 0\n",
"        return self._popularity_cache[id]\n",
"\n",
"class User:\n",
"    \"\"\"Per-user genre totals built from the training interactions.\"\"\"\n",
"\n",
"    def __init__(self, train_data, item_df):\n",
"        \"\"\"Join train_data to item_df on 'iid' and sum the genre columns per 'uid'.\"\"\"\n",
"        genre_columns = [\n",
"            'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',\n",
"            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',\n",
"            'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',\n",
"        ]\n",
"        joined = train_data.merge(item_df, on=['iid'])\n",
"        per_user = joined[['uid'] + genre_columns].groupby(['uid']).sum()\n",
"        self.userprofile = per_user.reset_index()\n",
"\n",
"    def get_user_profile(self, id):\n",
"        \"\"\"Return a 0/1 list over the genre columns for user id.\n",
"\n",
"        A genre is 1 when the user's total for it is at least the average of\n",
"        that user's totals. A user with no profile row gets 19 zeros.\n",
"        \"\"\"\n",
"        rows = self.userprofile[self.userprofile['uid'] == id].values\n",
"        if len(rows) == 0:\n",
"            return [0] * 19\n",
"        totals = rows[0][1:]  # drop the leading uid column\n",
"        cutoff = np.average(totals)\n",
"        return [int(total >= cutoff) for total in totals]\n",
" \n",
" \n",
" \n",
"# Build the lookup helpers, then spot-check one item and one user.\n",
"item_db = Item(item_df=item_df, play_log_df=train_data)\n",
"user_db = User(train_data=train_data, item_df=item_df)\n",
"print(item_db.get_contents_by_id(id=1))\n",
"print(item_db.get_popularity_by_id(id=1))\n",
"print(user_db.get_user_profile(id=7))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### done by inter-similarity of a recommendation list \n",
"import math\n",
"\n",
"def diversity(pred, item_db):\n",
"    \"\"\"Sum of pairwise L1 distances between the content vectors of the items in pred.\n",
"\n",
"    pred: iterable of item ids.\n",
"    item_db: object whose get_contents_by_id(id) returns a numeric array.\n",
"    Returns 0 when pred holds fewer than two items.\n",
"    \"\"\"\n",
"    pred = list(pred)\n",
"    if len(pred) < 2:\n",
"        return 0  # no pairs, so no lookups are needed\n",
"    # Fetch each content vector once rather than once per pair.\n",
"    contents = [item_db.get_contents_by_id(p) for p in pred]\n",
"    d = 0\n",
"    for i in range(len(contents)):\n",
"        for j in range(i + 1, len(contents)):\n",
"            d += sum(abs(contents[i] - contents[j]))\n",
"    return d\n",
"\n",
"def novelty(pred, item_db):\n",
"    \"\"\"Sum of 1 / (log2(popularity + 2) + 1) over the items in pred.\n",
"\n",
"    Items with lower popularity contribute more. Returns 0 for an empty pred.\n",
"    \"\"\"\n",
"    total = 0\n",
"    for item_id in pred:\n",
"        popularity = item_db.get_popularity_by_id(item_id)\n",
"        total += 1 / (math.log(popularity + 2, 2) + 1)\n",
"    return total\n",
"\n",
"\n",
"def serendipity(pred, groud_truth, uid, user_db, item_db):\n",
"    \"\"\"Average L1 distance between the user's binary profile and the relevant recommended items.\n",
"\n",
"    For every item in pred that is also in groud_truth, the distance between the\n",
"    user's 0/1 profile and the item's content vector is accumulated; the total is\n",
"    divided by len(pred).\n",
"\n",
"    pred: sequence of recommended item ids.\n",
"    groud_truth: iterable of relevant item ids (list, set, array or pandas Series).\n",
"    uid: user id passed to user_db.get_user_profile.\n",
"    Returns 0 for an empty pred.\n",
"    \"\"\"\n",
"    if len(pred) == 0:\n",
"        return 0\n",
"    # Compare against the ids themselves: 'p in series' on a pandas Series tests\n",
"    # the index labels, not the values, so a Series must be unpacked first.\n",
"    relevant = set(groud_truth)\n",
"    up = user_db.get_user_profile(uid)\n",
"    up_norm = np.array([1 if i > 0 else 0 for i in up])\n",
"    dist_total = 0\n",
"    for p in pred:\n",
"        if p in relevant:\n",
"            contents = item_db.get_contents_by_id(p)\n",
"            dist_total += sum(abs(up_norm - contents))\n",
"    return dist_total / len(pred)\n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"total_diversity = 0\n",
"total_novelty = 0\n",
"total_serendipity = 0\n",
"evaluated_users = 0\n",
"\n",
"for user_id, true_items in tqdm_notebook(sampled_results):\n",
"    rec_response = personalize_runtime.get_recommendations(\n",
"        campaignArn=rerank_campaign_arn,\n",
"        userId=str(user_id)\n",
"    )\n",
"    rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n",
"    total_diversity += diversity(rec_items, item_db)\n",
"    total_novelty += novelty(rec_items, item_db)\n",
"    # Pass the item ids themselves: 'x in series' on a pandas Series tests the\n",
"    # index labels, not the values.\n",
"    total_serendipity += serendipity(rec_items, true_items.values, user_id, user_db, item_db)\n",
"    evaluated_users += 1\n",
"\n",
"# Average over the users actually evaluated; sample_number can exceed the number\n",
"# of distinct users in the test set. max(..., 1) avoids dividing by zero.\n",
"denominator = max(evaluated_users, 1)\n",
"print(total_diversity / denominator)\n",
"print(total_novelty / denominator)\n",
"print(total_serendipity / denominator)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%store dataset_group_arn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%store schema_arn \n",
"%store item_schema_arn\n",
"%store user_schema_arn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%store role_arn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_group_arn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}