Take a look at the data files you have downloaded.

## Prepare your data 
[Back to top](#top)

The next thing to be done is to load the data and confirm the data is in a good state, then save it to a CSV where it is ready to be used with Amazon Personalize.

To get started, import a collection of Python libraries commonly used in data science.

In [None]:
import time
from time import sleep
import json
from datetime import datetime
import numpy as np
import boto3
import pandas as pd

In [None]:
# Create the boto3 clients used by this notebook:
#   personalize         -> client for the 'personalize' service
#   personalize_runtime -> client for 'personalize-runtime'; the evaluation
#                          cells call get_recommendations() on this one.
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [None]:

# Load the item DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load a file you
# produced yourself or otherwise trust.
item_df = pd.read_pickle("item_df.p")

In [None]:
item_df.head()

In [None]:

# Load the user DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load a file you
# produced yourself or otherwise trust.
user_df = pd.read_pickle("user_df.p")

In [None]:

# Load the train/test interaction splits from CSV.
# Later cells read the 'uid' and 'iid' columns of these frames.
train_data = pd.read_csv("ml-100k/train.csv")
test_data = pd.read_csv("ml-100k/test.csv")

Next, open the data file and take a look at the first several rows.

### Offline Evaluation 

In [None]:
!pip install tqdm
from tqdm import tqdm_notebook
from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k

In [None]:
# Pick up to `sample_number` distinct users from the test set at random.
sample_number = 1000
unique_user = test_data['uid'].unique()
# Shuffles the array in place; no seed is set here, so the sample differs
# from run to run.
np.random.shuffle(unique_user)
# Slicing silently yields fewer than `sample_number` users when the test set
# contains fewer unique users than that.
sampled_user = unique_user[:sample_number]

In [None]:


# Keep only the sampled users' test rows and group their item ids ('iid') by
# user. The loops below iterate this object as (uid, Series of iids) pairs.
sampled_results = test_data[test_data['uid'].isin(sampled_user)].groupby('uid').iid
# Bare expression so the notebook displays the grouped object.
sampled_results

In [None]:

# ARN of the deployed Amazon Personalize campaign to evaluate.
# TODO: paste your campaign ARN between the quotes before running this cell.
rerank_campaign_arn = ""
if not rerank_campaign_arn:
    raise ValueError("Set rerank_campaign_arn to your campaign ARN before running this cell.")

# For every sampled user, request recommendations from the campaign and
# record, per recommended position, whether that item is among the user's
# test items (1) or not (0).
relevance = []
for user_id, true_items in tqdm_notebook(sampled_results):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn=rerank_campaign_arn,
        userId=str(user_id),
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    # Build the lookup set once per user instead of scanning the array for
    # every recommended item.
    true_item_ids = set(true_items.values.tolist())
    relevance.append([int(x in true_item_ids) for x in rec_items])

In [None]:
# Average each ranking metric over every sampled user's relevance list.
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(hits) for hits in relevance]))
# Precision first, then NDCG, each at cutoffs 5, 10 and 25.
for cutoff in (5, 10, 25):
    print(f'precision_at_{cutoff}', np.mean([precision_at_k(hits, cutoff) for hits in relevance]))
for cutoff in (5, 10, 25):
    print(
        f'normalized_discounted_cumulative_gain_at_{cutoff}',
        np.mean([ndcg_at_k(hits, cutoff) for hits in relevance]),
    )

### Calculate diversity, novelty and serendipity

In [None]:
# Genre indicator columns read from item_df.
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# Per-user genre totals: join every training interaction to its item's genre
# columns on 'iid', then sum those columns per 'uid'.
userprofile = (
    train_data
    .merge(item_df, on=['iid'])[['uid'] + genres]
    .groupby(['uid'])
    .sum()
    .reset_index()
)

In [None]:
userprofile

In [None]:
## item object 

class Item:
    """Lookup helper over an item table and a per-item aggregated play log.

    Attributes:
        items: the item DataFrame exactly as passed in.
        play_log: ``play_log_df`` grouped by 'iid' with the remaining columns
            summed, with 'iid' restored as a regular column.
    """

    def __init__(self, item_df, play_log_df):
        self.items = item_df
        # One row per item id; every other column is summed over that item's rows.
        per_item_totals = play_log_df.groupby(['iid']).sum()
        self.play_log = per_item_totals.reset_index()

    def get_contents_by_id(self, id):
        """Return the values from column position 5 onward for item *id*.

        Uses the first row of ``items`` whose 'iid' equals *id*; the original
        code labels the returned slice "categories". Raises IndexError when
        no row matches.
        """
        matching_rows = self.items[self.items['iid'] == id].values
        first_row = matching_rows[0]
        return first_row[5:]  # categories

    def get_popularity_by_id(self, id):
        """Return the aggregated play-log value at column position 3 for *id*.

        Returns 0 when the item has no row in the aggregated play log.
        NOTE(review): which column sits at position 3 depends on the layout
        of the play-log frame - confirm it is the intended popularity signal.
        """
        matching_rows = self.play_log[self.play_log['iid'] == id].values
        if len(matching_rows) == 0:
            return 0
        return matching_rows[0][3]

class User:
    """Per-user genre profiles derived from training interactions.

    Attributes:
        userprofile: one row per 'uid' holding, for each of the 19 genre
            columns, the sum of that column over the items the user
            interacted with in ``train_data``.
    """

    def __init__(self, train_data, item_df):
        genres = [
            'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
            'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
        ]
        # Join interactions to item genre columns on 'iid', then total the
        # genre columns per user.
        self.userprofile = (
            train_data
            .merge(item_df, on=['iid'])[['uid'] + genres]
            .groupby(['uid'])
            .sum()
            .reset_index()
        )

    def get_user_profile(self, id):
        """Return a 0/1 list marking the genres at or above the user's average.

        A genre is marked 1 when its total is >= the mean of all the user's
        genre totals. Users with no profile row get a list of 19 zeros.
        """
        matching_rows = self.userprofile[self.userprofile['uid'] == id].values
        if len(matching_rows) == 0:
            return [0] * 19
        # Drop the leading 'uid' column; the rest are the genre totals.
        genre_totals = matching_rows[0][1:]
        threshold = np.average(genre_totals)
        return [int(total >= threshold) for total in genre_totals]
 
 
 
# Build the lookup helpers from the loaded frames; the training interactions
# serve as the play log for Item.
item_db = Item(item_df, train_data)
user_db = User(train_data, item_df)
# Smoke-test the helpers on item 1 and user 7.
print(item_db.get_contents_by_id(1)) 
print(item_db.get_popularity_by_id(1)) 
print(user_db.get_user_profile(7))

In [None]:
### done by inter-similarity of a recommendation list 
import math

def diversity(pred, item_db):
    """Return the summed pairwise content distance of a recommendation list.

    For every unordered pair of items in *pred*, adds the sum of absolute
    element-wise differences between their content vectors (as returned by
    ``item_db.get_contents_by_id``). The total is not normalised by the
    number of pairs.

    Args:
        pred: sequence of recommended item ids.
        item_db: object exposing ``get_contents_by_id(item_id)``.

    Returns:
        The accumulated distance; 0 for lists with fewer than two items.
    """
    # Look each item up once rather than once per pair.
    contents = [item_db.get_contents_by_id(p) for p in pred]
    d = 0
    for i, first in enumerate(contents):
        for second in contents[i + 1:]:
            d += sum(abs(first - second))
    return d

def novelty(pred, item_db):
    """Return the summed novelty score of a recommendation list.

    Each item contributes ``1 / (log2(popularity + 2) + 1)``, so items with a
    higher popularity value contribute less.

    Args:
        pred: sequence of recommended item ids.
        item_db: object exposing ``get_popularity_by_id(item_id)``.

    Returns:
        The accumulated score; 0 for an empty list.
    """
    score = 0
    for item_id in pred:
        popularity = item_db.get_popularity_by_id(item_id)
        score += 1 / (math.log(popularity + 2, 2) + 1)
    return score


def serendipity(pred, groud_truth, uid, user_db, item_db):
    """Return the average profile-to-content distance of the relevant hits.

    For every recommended item that also appears in *groud_truth*, adds the
    sum of absolute element-wise differences between the user's binarised
    profile and the item's content vector. The total is divided by the
    length of *pred*.

    Args:
        pred: sequence of recommended item ids.
        groud_truth: the user's relevant item ids. A pandas Series is matched
            on its values; any other container is used with ``in`` as given.
        uid: id of the user the recommendations were made for.
        user_db: object exposing ``get_user_profile(uid)``.
        item_db: object exposing ``get_contents_by_id(item_id)``.

    Returns:
        The averaged distance; 0 when *pred* is empty.
    """
    if len(pred) == 0:
        # Nothing was recommended; avoid dividing by zero below.
        return 0
    # `x in series` tests the Series index labels, not the item ids it
    # holds, so compare against the values instead.
    if isinstance(groud_truth, pd.Series):
        relevant = set(groud_truth.tolist())
    else:
        relevant = groud_truth
    up = user_db.get_user_profile(uid)
    up_norm = np.array([1 if i > 0 else 0 for i in up])
    dist_total = 0
    for p in pred:
        if p in relevant:
            contents = item_db.get_contents_by_id(p)
            dist_total += sum(abs(up_norm - contents))
    return dist_total / len(pred)
 
 
 

In [None]:
# Accumulate diversity, novelty and serendipity over the sampled users, then
# report the per-user averages.
total_diversity = 0
total_novelty = 0
total_serendipity = 0
# Number of users actually scored; this can be smaller than `sample_number`
# when the test set has fewer unique users than that.
evaluated_users = 0

for user_id, true_items in tqdm_notebook(sampled_results):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn=rerank_campaign_arn,
        userId=str(user_id),
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    total_diversity += diversity(rec_items, item_db)
    total_novelty += novelty(rec_items, item_db)
    # Pass the raw item ids: `in` on a pandas Series tests its index labels,
    # not the ids it holds.
    total_serendipity += serendipity(rec_items, true_items.values, user_id, user_db, item_db)
    evaluated_users += 1

# Not used in this cell; kept in case later cells rely on it.
users = test_data['uid'].unique()
# Average over the users that were scored; fall back to 1 so an empty sample
# still prints zeros instead of raising.
denominator = max(evaluated_users, 1)
print(total_diversity / denominator)
print(total_novelty / denominator)
print(total_serendipity / denominator)

In [None]:
%store dataset_group_arn

In [None]:
%store schema_arn 
%store item_schema_arn
%store user_schema_arn

In [None]:
%store role_arn

In [None]:
dataset_group_arn