{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take a look at the data files you have downloaded."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare your data <a class=\"anchor\" id=\"prepare\"></a>\n",
    "[Back to top](#top)\n",
    "\n",
    "The next thing to be done is to load the data and confirm the data is in a good state, then save it to a CSV where it is ready to be used with Amazon Personalize.\n",
    "\n",
    "To get started, import a collection of Python libraries commonly used in data science."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from time import sleep\n",
    "import json\n",
    "from datetime import datetime\n",
    "import numpy as np\n",
    "import boto3\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the SDK to Personalize:\n",
    "personalize = boto3.client('personalize')\n",
    "personalize_runtime = boto3.client('personalize-runtime')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "item_df = pd.read_pickle(\"item_df.p\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "user_df = pd.read_pickle(\"user_df.p\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_data = pd.read_csv(\"ml-100k/train.csv\")\n",
    "test_data = pd.read_csv(\"ml-100k/test.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next,open the data file and take a look at the first several rows."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Offline Evaluation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install tqdm\n",
    "from tqdm import tqdm_notebook\n",
    "from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_number = 1000\n",
    "unique_user = test_data['uid'].unique()\n",
    "np.random.shuffle(unique_user)\n",
    "sampled_user = unique_user[:sample_number]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "sampled_results = test_data[test_data['uid'].isin(sampled_user)].groupby('uid').iid\n",
    "sampled_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "rerank_campaign_arn = <your campaign>\n",
    "\n",
    "relevance = []\n",
    "for user_id, true_items in tqdm_notebook(sampled_results):\n",
    "    rec_response = personalize_runtime.get_recommendations(\n",
    "        campaignArn = rerank_campaign_arn,\n",
    "        userId = str(user_id)\n",
    "    )\n",
    "    rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n",
    "    relevance.append([int(x in true_items.values) for x in rec_items])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))\n",
    "print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))\n",
    "print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))\n",
    "print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))\n",
    "print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))\n",
    "print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))\n",
    "print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate diversity, novelty and serendipity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \\\n",
    "                                        'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \\\n",
    "                                        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n",
    "userprofile = train_data.merge(item_df, on=['iid'])[['uid']+genres].groupby(['uid']).sum().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "userprofile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## item object \n",
    "\n",
    "class Item: \n",
    "    def __init__(self, item_df, play_log_df):\n",
    "        self.items = item_df\n",
    "        self.play_log = play_log_df.groupby(['iid']).sum().reset_index()\n",
    "        \n",
    "    def get_contents_by_id(self, id): \n",
    "        return self.items[self.items['iid']==id].values[0][5:] #categories \n",
    "    \n",
    "    \n",
    "    def get_popularity_by_id(self, id):\n",
    "        if len(self.play_log[self.play_log['iid']==id].values) == 0:\n",
    "            return 0\n",
    "        return self.play_log[self.play_log['iid']==id].values[0][3]\n",
    "\n",
    "class User: \n",
    "    def __init__(self, train_data, item_df):\n",
    "        genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \\\n",
    "                                        'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \\\n",
    "                                        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n",
    "        self.userprofile = train_data.merge(item_df, on=['iid'])[['uid']+genres].groupby(['uid']).sum().reset_index()\n",
    "    \n",
    "    def get_user_profile(self, id):\n",
    "\n",
    "        if len(self.userprofile[self.userprofile['uid']==id].values)==0: \n",
    "            return [0 for i in range(0, 19)]\n",
    "        else:\n",
    "            raw_profile = self.userprofile[self.userprofile['uid']==id].values[0][1:]\n",
    "            avg = np.average(raw_profile)\n",
    "            return [1 if x >=avg else 0 for x in raw_profile]\n",
    "            \n",
    "    \n",
    "    \n",
    "item_db =  Item(item_df, train_data)\n",
    "user_db = User(train_data, item_df)\n",
    "print(item_db.get_contents_by_id(1))    \n",
    "print(item_db.get_popularity_by_id(1))  \n",
    "print(user_db.get_user_profile(7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### done by inter-similarity of a recommendation list \n",
    "import math\n",
    "\n",
    "def diversity(pred, item_db):\n",
    "    d = 0 \n",
    "    for i, p1 in enumerate(pred): \n",
    "        for j, p2 in enumerate(pred):\n",
    "            if j > i: \n",
    "                dist = sum(abs(item_db.get_contents_by_id(p1) - item_db.get_contents_by_id(p2))) \n",
    "                d += dist\n",
    "    return d \n",
    "\n",
    "def novelty(pred, item_db):\n",
    "    d = 0 \n",
    "    for i, p in enumerate(pred):\n",
    "        d += 1/(math.log(item_db.get_popularity_by_id(p)+2,2)+1)\n",
    "    return d\n",
    "\n",
    "\n",
    "def serendipity(pred, groud_truth, uid, user_db, item_db): \n",
    "    up = user_db.get_user_profile(uid)\n",
    "    up_norm = [1 if i > 0 else 0 for i in up ]\n",
    "    dist_total = 0 \n",
    "    for p in pred:\n",
    "        if p in groud_truth:\n",
    "            contents = item_db.get_contents_by_id(p)\n",
    "            dist = sum(abs(up_norm - contents))   \n",
    "            dist_total += dist\n",
    "    return  dist_total / len(pred)\n",
    "    \n",
    "    \n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_diversity = 0 \n",
    "total_novelty = 0 \n",
    "total_serendipity = 0 \n",
    "\n",
    "\n",
    "for user_id, true_items in tqdm_notebook(sampled_results):\n",
    "    rec_response = personalize_runtime.get_recommendations(\n",
    "        campaignArn = rerank_campaign_arn,\n",
    "        userId = str(user_id)\n",
    "    )\n",
    "    rec_items = [int(x['itemId']) for x in rec_response['itemList']]\n",
    "    total_diversity += diversity(rec_items, item_db)\n",
    "    total_novelty += novelty(rec_items, item_db)\n",
    "    total_serendipity += serendipity(rec_items, true_items, user_id, user_db, item_db)\n",
    "    \n",
    "users = test_data['uid'].unique()    \n",
    "print(total_diversity / sample_number)    \n",
    "print(total_novelty / sample_number)\n",
    "print(total_serendipity / sample_number)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store dataset_group_arn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store schema_arn \n",
    "%store item_schema_arn\n",
    "%store user_schema_arn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store role_arn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_group_arn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}