{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movie recommendation on Amazon SageMaker with Factorization Machines"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download ml-100k dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip\n",
    "!unzip -o ml-100k.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the training split so mini-batches are not ordered by user.\n",
    "%cd ml-100k\n",
    "!shuf ua.base -o ua.base.shuffled\n",
    "!head -10 ua.base.shuffled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -10 ua.test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this notebook uses the SageMaker Python SDK v1 API\n",
    "# (get_image_uri, train_instance_*, json_deserializer), matching the\n",
    "# conda_mxnet_p36 kernel it was written for.\n",
    "import sagemaker\n",
    "import sagemaker.amazon.common as smac\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.predictor import json_deserializer\n",
    "\n",
    "import boto3, csv, io, json\n",
    "import numpy as np\n",
    "from scipy.sparse import lil_matrix\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build training set and test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ml-100k dataset dimensions (fixed by the dataset itself)\n",
    "nbUsers = 943\n",
    "nbMovies = 1682\n",
    "\n",
    "# one-hot encoding vector size: one slot per user + one slot per movie\n",
    "nbFeatures = nbUsers + nbMovies\n",
    "\n",
    "# sample sizes of the ua.base / ua.test splits\n",
    "nbRatingsTrain = 90570\n",
    "nbRatingsTest = 9430"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a user -> [movie ids] index (0-based ids).\n",
    "# NOTE(review): moviesByUser is not used anywhere else in this notebook;\n",
    "# it is kept here in case a later recommendation step relies on it.\n",
    "moviesByUser = defaultdict(list)\n",
    "\n",
    "with open('ua.base.shuffled', 'r') as f:\n",
    "    samples = csv.reader(f, delimiter = '\\t')\n",
    "\n",
    "    for userId, movieId, rating, timestamp in samples:\n",
    "        moviesByUser[str(int(userId)-1)].append(int(movieId)-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadDataset(filename, lines, columns):\n",
    "    \"\"\"Load a MovieLens ratings file into a one-hot encoded sparse matrix.\n",
    "\n",
    "    Each rating becomes one row with exactly two 1s: one in the user slot\n",
    "    (columns 0..nbUsers-1) and one in the movie slot (columns nbUsers..).\n",
    "    The label is binarized: 1 if rating >= 4 (the user liked the movie),\n",
    "    0 otherwise.\n",
    "\n",
    "    :param filename: tab-separated file of (userId, movieId, rating, ts)\n",
    "    :param lines:    number of ratings (rows) in the file\n",
    "    :param columns:  one-hot feature vector size (nbUsers + nbMovies)\n",
    "    :return: (X, Y) where X is a float32 lil_matrix of shape (lines, columns)\n",
    "             and Y is a float32 numpy array of shape (lines,)\n",
    "    \"\"\"\n",
    "    # lil_matrix: efficient structure for incremental construction of\n",
    "    # sparse matrices (List of Lists format).\n",
    "    # https://www.scipy-lectures.org/advanced/scipy_sparse/lil_matrix.html\n",
    "    # Build directly as float32 instead of float64 + astype().\n",
    "    X = lil_matrix((lines, columns), dtype='float32')\n",
    "    # Labels are stored in a vector\n",
    "    Y = []\n",
    "    line = 0\n",
    "\n",
    "    with open(filename, 'r') as f:\n",
    "        samples = csv.reader(f, delimiter = '\\t')\n",
    "\n",
    "        for userId, movieId, rating, timestamp in samples:\n",
    "            # user slot, then movie slot offset by nbUsers\n",
    "            X[line, int(userId) - 1] = 1\n",
    "            X[line, nbUsers + int(movieId) - 1] = 1\n",
    "\n",
    "            # binarize: 4-5 stars -> positive class\n",
    "            if int(rating) >= 4:\n",
    "                Y.append(1)\n",
    "            else:\n",
    "                Y.append(0)\n",
    "            line = line + 1\n",
    "\n",
    "    Y = np.array(Y).astype('float32')\n",
    "    return X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# X_train: a training sparse matrix: 90,570 lines and 2,625 columns (99.92% sparse)\n",
    "# Y_train: a training label array: 90,570 ratings\n",
    "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n",
    "\n",
    "# X_test: a test sparse matrix: 9,430 lines and 2,625 columns\n",
    "# Y_test: a test label array: 9,430 ratings\n",
    "X_test, Y_test = loadDataset('ua.test', nbRatingsTest, nbFeatures)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_train.shape)\n",
    "print(Y_train.shape)\n",
    "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n",
    "assert Y_train.shape == (nbRatingsTrain, )\n",
    "# np.count_nonzero counts the 1s (positive labels); the 0s are the remainder.\n",
    "# (The original version printed these two counts swapped.)\n",
    "one_labels = np.count_nonzero(Y_train)\n",
    "print(\"Training labels: %d zeros, %d ones\" % (nbRatingsTrain - one_labels, one_labels))\n",
    "\n",
    "print(X_test.shape)\n",
    "print(Y_test.shape)\n",
    "assert X_test.shape == (nbRatingsTest, nbFeatures)\n",
    "assert Y_test.shape == (nbRatingsTest, )\n",
    "one_labels = np.count_nonzero(Y_test)\n",
    "print(\"Test labels: %d zeros, %d ones\" % (nbRatingsTest - one_labels, one_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert to protobuf and save to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO(review): replace with your own bucket, or use\n",
    "# sagemaker.Session().default_bucket() to avoid a hardcoded personal bucket.\n",
    "bucket = 'hyun-data-kr'\n",
    "prefix = 'sagemaker/fm-movielens'\n",
    "\n",
    "train_key = 'train.protobuf'\n",
    "train_prefix = '{}/{}'.format(prefix, 'train3')\n",
    "\n",
    "test_key = 'test.protobuf'\n",
    "test_prefix = '{}/{}'.format(prefix, 'test3')\n",
    "\n",
    "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n",
    "    \"\"\"Serialize (X, Y) to the recordIO-protobuf format SageMaker built-in\n",
    "    algorithms expect, upload it to s3://bucket/prefix/key, and return the\n",
    "    resulting S3 URI.\"\"\"\n",
    "    buf = io.BytesIO()\n",
    "    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n",
    "    buf.seek(0)\n",
    "    obj = '{}/{}'.format(prefix, key)\n",
    "    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n",
    "    return 's3://{}/{}'.format(bucket, obj)\n",
    "\n",
    "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)\n",
    "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)\n",
    "\n",
    "print(train_data)\n",
    "print(test_data)\n",
    "print('Output: {}'.format(output_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run training job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the region-specific ECR image for the built-in\n",
    "# Factorization Machines algorithm (SDK v1 helper).\n",
    "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
    "container = get_image_uri(boto3.Session().region_name, 'factorization-machines')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fm = sagemaker.estimator.Estimator(container,\n",
    "                                   get_execution_role(),\n",
    "                                   train_instance_count = 1,\n",
    "                                   train_instance_type = 'ml.c5.4xlarge',\n",
    "                                   output_path = output_prefix,\n",
    "                                   sagemaker_session = sagemaker.Session())\n",
    "\n",
    "# num_factors: the common dimension for the user and item matrices\n",
    "fm.set_hyperparameters(feature_dim = nbFeatures,\n",
    "                       predictor_type = 'binary_classifier',\n",
    "                       mini_batch_size = 1000,\n",
    "                       num_factors = 64,\n",
    "                       epochs = 100)\n",
    "\n",
    "fm.fit({'train': train_data, 'test': test_data})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fm_predictor = fm.deploy(instance_type = 'ml.c4.xlarge', initial_instance_count = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fm_serializer(data):\n",
    "    \"\"\"Serialize a batch of dense feature rows into the JSON request format\n",
    "    expected by the Factorization Machines inference endpoint:\n",
    "    {\"instances\": [{\"features\": [...]}, ...]}\"\"\"\n",
    "    js = {'instances': []}\n",
    "\n",
    "    for row in data:\n",
    "        js['instances'].append({'features': row.tolist()})\n",
    "    return json.dumps(js)\n",
    "\n",
    "fm_predictor.content_type = 'application/json'\n",
    "fm_predictor.serializer = fm_serializer\n",
    "fm_predictor.deserializer = json_deserializer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict 10 test samples and compare against the true labels.\n",
    "result = fm_predictor.predict(X_test[1000:1010].toarray())\n",
    "print(result)\n",
    "print(Y_test[1000:1010])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_test[1000:1010])\n",
    "print(Y_test[1000:1010])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tear down the endpoint to stop incurring charges.\n",
    "fm_predictor.delete_endpoint()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_mxnet_p36",
   "language": "python",
   "name": "conda_mxnet_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}