# Personalize with temporal evaluation on hold-out set

This notebook largely follows the basic notebook, with one additional tweak: we hold out the last 10% of each user's interactions as "future" data. We then set up an inference endpoint (a campaign) to retrieve recommendations and evaluate them externally against the held-out data.

The other minor difference is that we predict views rather than ratings. We find it more interesting to demonstrate recommendation of unpopular (yet highly personalized) movies than to recommend popular movies that everyone would enjoy (without personalization needs).

In [None]:
import boto3, os
import json
import numpy as np
import pandas as pd
import time
from botocore.exceptions import ClientError
!pip install tqdm
from tqdm import tqdm_notebook
from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k

In [None]:
# Random five-digit suffix appended to resource names (bucket, schema, role, ...)
# so repeated runs do not collide. Drawing an integer and zero-padding it
# guarantees exactly five digits; slicing the text of a random float can come
# out shorter or contain exponent characters for very small draws.
suffix = "{:05d}".format(np.random.randint(0, 100000))

In [None]:
# Name of the CSV written locally; the same name is used as the S3 object key.
filename = "DEMO-temporal-holdout.csv"
# Bucket that stages the CSV for Personalize.
bucket = "demo-temporal-holdout-" + suffix  # replace with the name of your S3 bucket

In [None]:
# Create the S3 bucket via the AWS CLI; IPython substitutes {bucket} with the
# value of the Python variable `bucket` before running the shell command.
!aws s3 mb s3://{bucket}

In [None]:
# Two boto3 clients: `personalize` is used for create/describe/delete calls,
# `personalize_runtime` for get_recommendations against the campaign.
personalize, personalize_runtime = (
    boto3.client(service_name)
    for service_name in ('personalize', 'personalize-runtime')
)

# Download and process benchmark data

In [None]:
!wget -N http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip
data = pd.read_csv('./ml-1m/ratings.dat', sep='::', names=['USER_ID','ITEM_ID','EVENT_VALUE', 'TIMESTAMP'])

In [None]:
# Show at most 5 rows when a DataFrame is displayed.
pd.options.display.max_rows = 5

In [None]:
# Keep only the columns that match the columns in the schema below
# (EVENT_VALUE, the rating, is dropped).
schema_columns = ['USER_ID', 'ITEM_ID', 'TIMESTAMP']
data = data[schema_columns]

# Report how many distinct users and items the full data set contains.
n_users = data['USER_ID'].unique().size
n_items = data['ITEM_ID'].unique().size
print('unique users %d; unique items %d' % (n_users, n_items))
data

### Extract last 10% of interactions per user as hold-out tests

In [None]:
# Percentile position of every interaction within its user's history, ordered
# by TIMESTAMP; method='first' breaks timestamp ties by order of appearance.
ranks = data.groupby('USER_ID')['TIMESTAMP'].rank(method='first', pct=True)

# Interactions ranked above the 90th percentile form the hold-out set;
# everything else stays in `data` and is used for training.
in_holdout = ranks > 0.9
holdout = data.loc[in_holdout]
data = data.loc[~in_holdout]

In [None]:
# Distinct users and items remaining in the training portion after the split.
n_users = data['USER_ID'].unique().size
n_items = data['ITEM_ID'].unique().size
print('unique users %d; unique items %d' % (n_users, n_items))
data

### Upload data

In [None]:
# Write the training interactions to a local CSV (without the index column) ...
data.to_csv(filename, index=False)

# ... and upload that file to the bucket under the same name as its key.
s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(filename)
s3_object.upload_file(filename)

# Create Schema

In [None]:
# Fields of the interactions schema: string user/item identifiers plus a
# long-valued timestamp, matching the columns of the uploaded CSV.
interaction_fields = [
    {"name": field_name, "type": field_type}
    for field_name, field_type in (
        ("USER_ID", "string"),
        ("ITEM_ID", "string"),
        ("TIMESTAMP", "long"),
    )
]

schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": interaction_fields,
    "version": "1.0",
}

# Register the schema with Personalize; it is passed as a JSON string.
create_schema_response = personalize.create_schema(
    schema=json.dumps(schema),
    name="DEMO-temporal-schema-" + suffix,
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

## Datasets and Dataset Groups

### Create a Dataset Group

In [None]:
# Create the dataset group that will own the dataset, solution and campaign.
dataset_group_name = "DEMO-temporal-dataset-group-" + suffix
create_dataset_group_response = personalize.create_dataset_group(
    name=dataset_group_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

In [None]:
# Poll the dataset group every 20 seconds until it is ACTIVE or has failed,
# giving up silently once the 3-hour budget is spent.
finished_states = ("ACTIVE", "CREATE FAILED")
status = None
max_time = time.time() + 3 * 60 * 60  # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn=dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status in finished_states:
        break

    time.sleep(20)

### Create an 'Interactions' Dataset Type

In [None]:
# Create the interactions dataset inside the dataset group, bound to the schema.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    name="DEMO-temporal-dataset-" + suffix,
    schemaArn=schema_arn,
    datasetGroupArn=dataset_group_arn,
    datasetType=dataset_type,
)

dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

## S3 Bucket Permissions for Personalize Access

### Attach a Policy to the S3 Bucket

In [None]:
s3 = boto3.client("s3")

# The policy covers the bucket itself (for ListBucket) and every object in it
# (for GetObject).
bucket_arn = "arn:aws:s3:::{}".format(bucket)
bucket_objects_arn = "arn:aws:s3:::{}/*".format(bucket)

# Single statement allowing the Personalize service principal to read the bucket.
personalize_read_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [bucket_arn, bucket_objects_arn],
}

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_read_statement],
}

# Trailing semicolon keeps the notebook from echoing the response.
s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy));

### Create a role that has the right permissions

In [None]:
iam = boto3.client("iam")

role_name = "PersonalizeS3Role-" + suffix
# Trust policy: lets the Personalize service assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Only create_role is guarded, so a ClientError raised by any other call is
# never mistaken for "role already exists".
try:
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
    )
except ClientError as e:
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        raise
    # Re-running the cell: reuse the role created earlier.
    role_arn = iam.get_role(RoleName=role_name)['Role']['Arn']
else:
    role_arn = create_role_response["Role"]["Arn"]

# Attach the S3 read-only managed policy whether the role is new or reused, so
# a role left without the policy by an interrupted earlier run is repaired.
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
)

# sometimes need to wait a bit for the role to be created
time.sleep(45)
print(role_arn)

# Create your Dataset import jobs

This is your interactions data upload

In [None]:
# Import the uploaded CSV into the interactions dataset; Personalize reads the
# object from S3 using the role created above.
interactions_location = "s3://{}/{}".format(bucket, filename)
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="DEMO-temporal-dataset-import-job-" + suffix,
    datasetArn=dataset_arn,
    roleArn=role_arn,
    dataSource={"dataLocation": interactions_location},
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

### Wait for Dataset Import Job and Dataset Import Job Run to Have ACTIVE Status

In [None]:
# Poll the import job once a minute until it is ACTIVE or has failed, for at
# most 3 hours. When the response carries a "latestDatasetImportJobRun" entry,
# that run's status is reported instead of the job's own status.
finished_states = ("ACTIVE", "CREATE FAILED")
status = None
max_time = time.time() + 3 * 60 * 60  # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn=dataset_import_job_arn
    )

    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]
    if "latestDatasetImportJobRun" in dataset_import_job:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    else:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))

    if status in finished_states:
        break

    time.sleep(60)

# Create Solution

In [None]:
# Print the ARN of every recipe returned by list_recipes, one per line.
recipe_list = personalize.list_recipes()
recipe_arns = [recipe['recipeArn'] for recipe in recipe_list['recipes']]
for arn in recipe_arns:
    print(arn)

There are many recipes for different scenarios. In this example, we only have interactions data, so we will choose one from the basic recipes.

| Feasible? | Recipe | Description 
|-------- | -------- |:------------
| Y | aws-popularity-count | Calculates popularity of items based on count of events against that item in user-item interactions dataset.
| Y | aws-hrnn | Predicts items a user will interact with. A hierarchical recurrent neural network which can model the temporal order of user-item interactions.
| N - requires meta data | aws-hrnn-metadata | Predicts items a user will interact with. HRNN with additional features derived from contextual (user-item interaction metadata), user metadata (user dataset) and item metadata (item dataset)
| N - for bandits and requires meta data | aws-hrnn-coldstart | Predicts items a user will interact with. HRNN-metadata with personalized exploration of new items.
| N - for item-based queries | aws-sims | Computes items similar to a given item based on co-occurrence of item in same user history in user-item interaction dataset
| N - for reranking a short list | aws-personalized-ranking | Reranks a list of items for a user. Trains on user-item interactions dataset. 


We (or autoML) can run all of these basic recipes and choose the best-performing model from internal metrics. We recommend comparisons, especially with popularity-baseline, to see the lifts in metrics via personalization. However, in this demo, we will pick one recipe - aws-hrnn, to focus on external evaluations.

In [None]:
# Recipe used for the solution below: aws-hrnn, which needs only interactions data.
recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn"

### Create and Wait for your Solution
This is a 2 step process
1. Create a Solution
2. Create a Solution Version

In [None]:
# Step 1: create the solution (recipe + dataset group); no training happens yet.
solution_name = "DEMO-temporal-solution-" + suffix
create_solution_response = personalize.create_solution(
    recipeArn=recipe_arn,
    datasetGroupArn=dataset_group_arn,
    name=solution_name,
)

solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

In [None]:
# Step 2: create a solution version for the solution defined above.
create_solution_version_response = personalize.create_solution_version(
    solutionArn=solution_arn
)
solution_version_arn = create_solution_version_response['solutionVersionArn']

print(json.dumps(create_solution_version_response, indent=2))

### Wait for Solution Version to Have ACTIVE Status

In [None]:
# Poll the solution version once a minute until it is ACTIVE or has failed,
# for at most 3 hours.
finished_states = ("ACTIVE", "CREATE FAILED")
status = None
max_time = time.time() + 3 * 60 * 60  # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn=solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    if status in finished_states:
        break

    time.sleep(60)

### Get Metrics of Solution

In [None]:
# Fetch the metrics Personalize computed for this solution version and
# pretty-print the raw response.
get_metrics_response = personalize.get_solution_metrics(solutionVersionArn=solution_version_arn)
metrics_json = json.dumps(get_metrics_response, indent=2)
print(metrics_json)

# Create and Wait for Campaign

In [None]:
# Deploy the trained solution version as a campaign (the real-time endpoint
# queried during evaluation) with a minimum provisioned throughput of 2 TPS.
campaign_name = "DEMO-temporal-campaign-" + suffix
create_campaign_response = personalize.create_campaign(
    minProvisionedTPS=2,
    solutionVersionArn=solution_version_arn,
    name=campaign_name,
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

### Wait for Campaign to Have ACTIVE Status

In [None]:
# Poll the campaign once a minute until it is ACTIVE or has failed, for at
# most 3 hours.
finished_states = ("ACTIVE", "CREATE FAILED")
status = None
max_time = time.time() + 3 * 60 * 60  # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn=campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    if status in finished_states:
        break

    time.sleep(60)

# Evaluate using external metrics

An explanation of the evaluation metrics is provided at https://docs.aws.amazon.com/personalize/latest/dg/working-with-training-metrics.html

For example, suppose we recommend four items and two of them are relevant, $r=[0,1,0,1]$. In this case, the metrics are:

|Name	|Example	|Explanation
|:------|:----------|:----------
|Precision@K	|$\frac{2}{4} = 0.5$	|Total relevant items divided by total recommended items.
|Mean reciprocal ranks (MRR@K)	|${\rm mean}(\frac{1}{2}, \frac{1}{4}) = 0.375$	|Considers positional effects by computing the mean of the inverse positions of all relevant items.
|Normalized discounted cumulative gains (NDCG@K)	|$\frac{\frac{1}{\log(1 + 2)} + \frac{1}{\log(1 + 4)}}{\frac{1}{\log(1 + 1)} + \frac{1}{\log(1 + 2)}} = 0.65$	|Considers positional effects by applying inverse logarithmic weights based on the positions of relevant items, normalized by the largest possible scores from ideal recommendations.
|Average precision (AP@K)	|${\rm mean}(\frac{1}{2}, \frac{2}{4}) = 0.5$	|Mean of precision@K taken at the position K of every relevant item.

These metrics are different from the internal metrics in two aspects:
* They are evaluated at different times, which may imply different click rates. We recommend always keeping the evaluations within the same time period to avoid temporal drift.
* The example external evaluations may hold out and consider multiple items as ground truth, while the internal evaluations only hold out the last item in each user history as the ground truth. There is no absolute preference as to how many items should be held out; we recommend designing evaluation methods that resemble the actual use case.

In [None]:
# For every user in the hold-out set, fetch the campaign's recommendations and
# record a 0/1 flag per recommended item: 1 when the item is among that user's
# held-out interactions, 0 otherwise. Order follows the recommendation list.
relevance = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    rec_response = personalize_runtime.get_recommendations(
        userId=str(user_id),
        campaignArn=campaign_arn,
    )
    # Item ids come back as strings; the hold-out ids are integers.
    rec_items = [int(entry['itemId']) for entry in rec_response['itemList']]
    held_out = true_items.values
    relevance.append([1 if item in held_out else 0 for item in rec_items])

In [None]:
# Average each ranking metric over all hold-out users. Each entry is
# (printed label, metric function, extra positional arguments such as K).
metric_report = (
    ('mean_reciprocal_rank', mean_reciprocal_rank, ()),
    ('precision_at_5', precision_at_k, (5,)),
    ('precision_at_10', precision_at_k, (10,)),
    ('precision_at_25', precision_at_k, (25,)),
    ('normalized_discounted_cumulative_gain_at_5', ndcg_at_k, (5,)),
    ('normalized_discounted_cumulative_gain_at_10', ndcg_at_k, (10,)),
    ('normalized_discounted_cumulative_gain_at_25', ndcg_at_k, (25,)),
)
for label, metric_fn, extra_args in metric_report:
    print(label, np.mean([metric_fn(r, *extra_args) for r in relevance]))

### Optional: slightly better results after deduplicating previous purchase histories

In [None]:
# Build every user's training history once, as a set, instead of scanning the
# whole training DataFrame for each user inside the loop.
past_items_by_user = {
    uid: set(items) for uid, items in data.groupby('USER_ID').ITEM_ID
}

# Same evaluation as the plain relevance loop, except that items the user has
# already interacted with in the training data are moved to the back of the
# recommendation list before scoring.
rel_dedup = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn = campaign_arn,
        userId = str(user_id)
    )
    past_items = past_items_by_user.get(user_id, set())
    # Named `recommended` rather than `topk` so the popularity baseline's
    # global `topk` is not overwritten when cells are re-run out of order.
    recommended = [int(x['itemId']) for x in rec_response['itemList']]
    rec_items = [x for x in recommended if x not in past_items]
    if len(rec_items) < 25:
        # Too few unseen items: pad with the remaining (already seen)
        # recommendations, keeping their original order.
        rec_items.extend([x for x in recommended if x not in rec_items])
    rec_items = rec_items[:25]

    true_set = set(true_items)
    rel_dedup.append([int(x in true_set) for x in rec_items])

In [None]:
# Average each ranking metric over all hold-out users for the deduplicated
# recommendations. Each entry is (printed label, metric function, extra args).
metric_report = (
    ('mean_reciprocal_rank', mean_reciprocal_rank, ()),
    ('precision_at_5', precision_at_k, (5,)),
    ('precision_at_10', precision_at_k, (10,)),
    ('precision_at_25', precision_at_k, (25,)),
    ('normalized_discounted_cumulative_gain_at_5', ndcg_at_k, (5,)),
    ('normalized_discounted_cumulative_gain_at_10', ndcg_at_k, (10,)),
    ('normalized_discounted_cumulative_gain_at_25', ndcg_at_k, (25,)),
)
for label, metric_fn, extra_args in metric_report:
    print(label, np.mean([metric_fn(r, *extra_args) for r in rel_dedup]))

### Try comparing with popularity baseline as a dummy recommender

In [None]:
# Popularity baseline: the 100 item ids with the most training interactions,
# most popular first.
item_counts = data.groupby("ITEM_ID")["TIMESTAMP"].count()
ranked_items = item_counts.sort_values(ascending=False)
topk = ranked_items.head(100).index.values

In [None]:
# Score the non-personalized baseline: every user is "recommended" the same
# 25 most popular items, flagged 1 where the item is in that user's hold-out.
rel_popular = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    rec_items = topk[:25]
    held_out = true_items.values
    rel_popular.append([1 if item in held_out else 0 for item in rec_items])

In [None]:
# Average each ranking metric over all hold-out users for the popularity
# baseline. Each entry is (printed label, metric function, extra args).
metric_report = (
    ('mean_reciprocal_rank', mean_reciprocal_rank, ()),
    ('precision_at_5', precision_at_k, (5,)),
    ('precision_at_10', precision_at_k, (10,)),
    ('precision_at_25', precision_at_k, (25,)),
    ('normalized_discounted_cumulative_gain_at_5', ndcg_at_k, (5,)),
    ('normalized_discounted_cumulative_gain_at_10', ndcg_at_k, (10,)),
    ('normalized_discounted_cumulative_gain_at_25', ndcg_at_k, (25,)),
)
for label, metric_fn, extra_args in metric_report:
    print(label, np.mean([metric_fn(r, *extra_args) for r in rel_popular]))

### Popularity baseline deduplicating user histories

In [None]:
# Build every user's training history once, as a set, instead of scanning the
# whole training DataFrame for each user inside the loop.
past_items_by_user = {
    uid: set(items) for uid, items in data.groupby('USER_ID').ITEM_ID
}

# Popularity baseline with the user's already-seen items moved to the back:
# recommend the most popular items the user has not interacted with yet.
rel_pop_dedup = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    past_items = past_items_by_user.get(user_id, set())
    rec_items = [x for x in topk if x not in past_items]
    if len(rec_items) < 25:
        # Too few unseen popular items: pad with the remaining (already seen)
        # popular items, keeping popularity order.
        rec_items.extend([x for x in topk if x not in rec_items])
    rec_items = rec_items[:25]

    true_set = set(true_items)
    rel_pop_dedup.append([int(x in true_set) for x in rec_items])

In [None]:
# Average each ranking metric over all hold-out users for the deduplicated
# popularity baseline. Each entry is (printed label, metric function, extra args).
metric_report = (
    ('mean_reciprocal_rank', mean_reciprocal_rank, ()),
    ('precision_at_5', precision_at_k, (5,)),
    ('precision_at_10', precision_at_k, (10,)),
    ('precision_at_25', precision_at_k, (25,)),
    ('normalized_discounted_cumulative_gain_at_5', ndcg_at_k, (5,)),
    ('normalized_discounted_cumulative_gain_at_10', ndcg_at_k, (10,)),
    ('normalized_discounted_cumulative_gain_at_25', ndcg_at_k, (25,)),
)
for label, metric_fn, extra_args in metric_report:
    print(label, np.mean([metric_fn(r, *extra_args) for r in rel_pop_dedup]))

# Clean up

In [None]:
# Tear down the resources created by this notebook, dependents first. Each
# delete call returns before the resource is gone, so every step polls the
# matching list call until it comes back empty.

# Campaign (depends on the solution version).
personalize.delete_campaign(campaignArn=campaign_arn)
while personalize.list_campaigns(solutionArn=solution_arn)['campaigns']:
    time.sleep(5)

# Solution (depends on the dataset group).
personalize.delete_solution(solutionArn=solution_arn)
while personalize.list_solutions(datasetGroupArn=dataset_group_arn)['solutions']:
    time.sleep(5)

# Every dataset in the dataset group.
for dataset in personalize.list_datasets(datasetGroupArn=dataset_group_arn)['datasets']:
    personalize.delete_dataset(datasetArn=dataset['datasetArn'])
while personalize.list_datasets(datasetGroupArn=dataset_group_arn)['datasets']:
    time.sleep(5)

personalize.delete_dataset_group(datasetGroupArn=dataset_group_arn)

# The schema is no longer referenced once its dataset is gone; delete it so
# repeated runs do not accumulate schemas in the account.
personalize.delete_schema(schemaArn=schema_arn)

# Remove the IAM role created for Personalize. The managed policy has to be
# detached before the role can be deleted.
iam.detach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
)
iam.delete_role(RoleName=role_name)

# If you are using a personal bucket, execute this cell with caution!

In [None]:
# Delete every object in the bucket (IPython substitutes {bucket}). This is
# irreversible. NOTE(review): the bucket itself appears to be left in place;
# removing it would need a separate `aws s3 rb` - confirm whether that is intended.
!aws s3 rm s3://{bucket} --recursive