{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movie recommendation on Amazon SageMaker with Factorization Machines"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download ml-100k dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip\n",
    "!unzip -o ml-100k.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the training split so mini-batches are not ordered by user.\n",
    "%cd ml-100k\n",
    "!shuf ua.base -o ua.base.shuffled\n",
    "!head -10 ua.base.shuffled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -10 ua.test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this notebook uses the SageMaker Python SDK v1 API\n",
    "# (get_image_uri, train_instance_*, json_deserializer), matching the\n",
    "# conda_mxnet_p36 kernel it was written for.\n",
    "import sagemaker\n",
    "import sagemaker.amazon.common as smac\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.predictor import json_deserializer\n",
    "\n",
    "import boto3, csv, io, json\n",
    "import numpy as np\n",
    "from scipy.sparse import lil_matrix\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build training set and test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ml-100k dataset dimensions (fixed by the dataset itself)\n",
    "nbUsers = 943\n",
    "nbMovies = 1682\n",
    "\n",
    "# one-hot encoding vector size: one slot per user + one slot per movie\n",
    "nbFeatures = nbUsers + nbMovies\n",
    "\n",
    "# sample sizes of the ua.base / ua.test splits\n",
    "nbRatingsTrain = 90570\n",
    "nbRatingsTest = 9430"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a user -> [movie ids] index (0-based ids).\n",
    "# NOTE(review): moviesByUser is not used anywhere else in this notebook;\n",
    "# it is kept here in case a later recommendation step relies on it.\n",
    "moviesByUser = defaultdict(list)\n",
    "\n",
    "with open('ua.base.shuffled', 'r') as f:\n",
    "    samples = csv.reader(f, delimiter = '\\t')\n",
    "\n",
    "    for userId, movieId, rating, timestamp in samples:\n",
    "        moviesByUser[str(int(userId)-1)].append(int(movieId)-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadDataset(filename, lines, columns):\n",
    "    \"\"\"Load a MovieLens ratings file into a one-hot encoded sparse matrix.\n",
    "\n",
    "    Each rating becomes one row with exactly two 1s: one in the user slot\n",
    "    (columns 0..nbUsers-1) and one in the movie slot (columns nbUsers..).\n",
    "    The label is binarized: 1 if rating >= 4 (the user liked the movie),\n",
    "    0 otherwise.\n",
    "\n",
    "    :param filename: tab-separated file of (userId, movieId, rating, ts)\n",
    "    :param lines:    number of ratings (rows) in the file\n",
    "    :param columns:  one-hot feature vector size (nbUsers + nbMovies)\n",
    "    :return: (X, Y) where X is a float32 lil_matrix of shape (lines, columns)\n",
    "             and Y is a float32 numpy array of shape (lines,)\n",
    "    \"\"\"\n",
    "    # lil_matrix: efficient structure for incremental construction of\n",
    "    # sparse matrices (List of Lists format).\n",
    "    # https://www.scipy-lectures.org/advanced/scipy_sparse/lil_matrix.html\n",
    "    # Build directly as float32 instead of float64 + astype().\n",
    "    X = lil_matrix((lines, columns), dtype='float32')\n",
    "    # Labels are stored in a vector\n",
    "    Y = []\n",
    "    line = 0\n",
    "\n",
    "    with open(filename, 'r') as f:\n",
    "        samples = csv.reader(f, delimiter = '\\t')\n",
    "\n",
    "        for userId, movieId, rating, timestamp in samples:\n",
    "            # user slot, then movie slot offset by nbUsers\n",
    "            X[line, int(userId) - 1] = 1\n",
    "            X[line, nbUsers + int(movieId) - 1] = 1\n",
    "\n",
    "            # binarize: 4-5 stars -> positive class\n",
    "            if int(rating) >= 4:\n",
    "                Y.append(1)\n",
    "            else:\n",
    "                Y.append(0)\n",
    "            line = line + 1\n",
    "\n",
    "    Y = np.array(Y).astype('float32')\n",
    "    return X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# X_train: a training sparse matrix: 90,570 lines and 2,625 columns (99.92% sparse)\n",
    "# Y_train: a training label array: 90,570 ratings\n",
    "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n",
    "\n",
    "# X_test: a test sparse matrix: 9,430 lines and 2,625 columns\n",
    "# Y_test: a test label array: 9,430 ratings\n",
    "X_test, Y_test = loadDataset('ua.test', nbRatingsTest, nbFeatures)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_train.shape)\n",
    "print(Y_train.shape)\n",
    "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n",
    "assert Y_train.shape == (nbRatingsTrain, )\n",
    "# np.count_nonzero counts the 1s (positive labels); the 0s are the remainder.\n",
    "# (The original version printed these two counts swapped.)\n",
    "one_labels = np.count_nonzero(Y_train)\n",
    "print(\"Training labels: %d zeros, %d ones\" % (nbRatingsTrain - one_labels, one_labels))\n",
    "\n",
    "print(X_test.shape)\n",
    "print(Y_test.shape)\n",
    "assert X_test.shape == (nbRatingsTest, nbFeatures)\n",
    "assert Y_test.shape == (nbRatingsTest, )\n",
    "one_labels = np.count_nonzero(Y_test)\n",
    "print(\"Test labels: %d zeros, %d ones\" % (nbRatingsTest - one_labels, one_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert to protobuf and save to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO(review): replace with your own bucket, or use\n",
    "# sagemaker.Session().default_bucket() to avoid a hardcoded personal bucket.\n",
    "bucket = 'hyun-data-kr'\n",
    "prefix = 'sagemaker/fm-movielens'\n",
    "\n",
    "train_key = 'train.protobuf'\n",
    "train_prefix = '{}/{}'.format(prefix, 'train3')\n",
    "\n",
    "test_key = 'test.protobuf'\n",
    "test_prefix = '{}/{}'.format(prefix, 'test3')\n",
    "\n",
    "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n",
    "    \"\"\"Serialize (X, Y) to the recordIO-protobuf format SageMaker built-in\n",
    "    algorithms expect, upload it to s3://bucket/prefix/key, and return the\n",
    "    resulting S3 URI.\"\"\"\n",
    "    buf = io.BytesIO()\n",
    "    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n",
    "    buf.seek(0)\n",
    "    obj = '{}/{}'.format(prefix, key)\n",
    "    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n",
    "    return 's3://{}/{}'.format(bucket, obj)\n",
    "\n",
    "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)\n",
    "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)\n",
    "\n",
    "print(train_data)\n",
    "print(test_data)\n",
    "print('Output: {}'.format(output_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run training job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the region-specific ECR image for the built-in\n",
    "# Factorization Machines algorithm (SDK v1 helper).\n",
    "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
    "container = get_image_uri(boto3.Session().region_name, 'factorization-machines')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fm = sagemaker.estimator.Estimator(container,\n",
    "                                   get_execution_role(),\n",
    "                                   train_instance_count = 1,\n",
    "                                   train_instance_type = 'ml.c5.4xlarge',\n",
    "                                   output_path = output_prefix,\n",
    "                                   sagemaker_session = sagemaker.Session())\n",
    "\n",
    "# num_factors: the common dimension for the user and item matrices\n",
    "fm.set_hyperparameters(feature_dim = nbFeatures,\n",
    "                       predictor_type = 'binary_classifier',\n",
    "                       mini_batch_size = 1000,\n",
    "                       num_factors = 64,\n",
    "                       epochs = 100)\n",
    "\n",
    "fm.fit({'train': train_data, 'test': test_data})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fm_predictor = fm.deploy(instance_type = 'ml.c4.xlarge', initial_instance_count = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fm_serializer(data):\n",
    "    \"\"\"Serialize a batch of dense feature rows into the JSON request format\n",
    "    expected by the Factorization Machines inference endpoint:\n",
    "    {\"instances\": [{\"features\": [...]}, ...]}\"\"\"\n",
    "    js = {'instances': []}\n",
    "\n",
    "    for row in data:\n",
    "        js['instances'].append({'features': row.tolist()})\n",
    "    return json.dumps(js)\n",
    "\n",
    "fm_predictor.content_type = 'application/json'\n",
    "fm_predictor.serializer = fm_serializer\n",
    "fm_predictor.deserializer = json_deserializer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict 10 test samples and compare against the true labels.\n",
    "result = fm_predictor.predict(X_test[1000:1010].toarray())\n",
    "print(result)\n",
    "print(Y_test[1000:1010])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_test[1000:1010])\n",
    "print(Y_test[1000:1010])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tear down the endpoint to stop incurring charges.\n",
    "fm_predictor.delete_endpoint()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_mxnet_p36",
   "language": "python",
   "name": "conda_mxnet_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}