# Movie recommendation on Amazon SageMaker with Factorization Machines

### Download ml-100k dataset

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

In [None]:
# Work from inside the extracted dataset directory; later cells open the
# data files by relative name.
%cd ml-100k
# Shuffle the lines of ua.base into ua.base.shuffled, then preview the first
# 10 lines of the shuffled file.
!shuf ua.base -o ua.base.shuffled
!head -10 ua.base.shuffled

In [None]:
# Preview the first 10 lines of the ua.test file.
!head -10 ua.test

In [None]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix
from collections import defaultdict

### Build training set and test set

In [None]:
# Number of users and movies in the dataset.
nbUsers, nbMovies = 943, 1682

# Width of a one-hot encoded sample: one column per user plus one per movie.
nbFeatures = nbMovies + nbUsers

# Number of ratings (samples) in the training file and in the test file.
nbRatingsTrain, nbRatingsTest = 90570, 9430

In [None]:
# Index the training ratings by user: zero-based user id (stored as a string)
# -> list of zero-based movie ids, in file order.
moviesByUser = defaultdict(list)

with open('ua.base.shuffled', 'r') as ratingsFile:
    for record in csv.reader(ratingsFile, delimiter='\t'):
        # Every record must hold exactly four tab-separated fields.
        userField, movieField, _rating, _timestamp = record
        userKey = str(int(userField) - 1)
        moviesByUser[userKey].append(int(movieField) - 1)

In [None]:
def loadDataset(filename, lines, columns, userCount=None):
    """Load a tab-separated ratings file as one-hot features and binary labels.

    Each row of the file must hold four fields: user id, movie id, rating and
    timestamp. A row becomes one sample with two features set to 1: column
    ``userId - 1`` for the user and column ``userCount + movieId - 1`` for the
    movie. The label is 1 when the rating is 4 or more, and 0 otherwise.

    Args:
        filename: Path of the tab-separated ratings file.
        lines: Number of samples in the file (rows of the returned matrix).
        columns: Width of the one-hot encoding (columns of the returned matrix).
        userCount: Number of user columns placed before the movie columns.
            Defaults to the notebook-level ``nbUsers``.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float32 ``lil_matrix`` of shape
        ``(lines, columns)`` and ``Y`` is a float32 array holding one label
        per row.

    Raises:
        ValueError: If the file holds fewer than ``lines`` rows, which would
            leave all-zero samples in ``X`` with no matching label in ``Y``.
    """
    if userCount is None:
        userCount = nbUsers
    movieOffset = int(userCount)

    # Features are one-hot encoded in a sparse matrix.
    # lil_matrix (List of Lists format) is the structure for constructing
    # sparse matrices incrementally; allocating it as float32 up front avoids
    # a separate conversion pass.
    # https://www.scipy-lectures.org/advanced/scipy_sparse/lil_matrix.html
    X = lil_matrix((lines, columns), dtype='float32')
    # Labels are collected in a list and converted to a vector at the end.
    labels = []

    with open(filename, 'r') as f:
        samples = csv.reader(f, delimiter='\t')
        for line, (userId, movieId, rating, timestamp) in enumerate(samples):
            X[line, int(userId) - 1] = 1
            X[line, movieOffset + int(movieId) - 1] = 1
            labels.append(1 if int(rating) >= 4 else 0)

    if len(labels) != lines:
        raise ValueError(
            'expected %d rows in %s but read %d' % (lines, filename, len(labels)))

    Y = np.array(labels, dtype='float32')
    return X, Y

In [None]:
# Training split: a sparse matrix of 90,570 lines by 2,625 columns (99.92%
# sparse) and an array of 90,570 labels.
X_train, Y_train = loadDataset(
    filename='ua.base.shuffled', lines=nbRatingsTrain, columns=nbFeatures)

# Test split: a sparse matrix of 9,430 lines by 2,625 columns and an array of
# 9,430 labels.
X_test, Y_test = loadDataset(
    filename='ua.test', lines=nbRatingsTest, columns=nbFeatures)

In [None]:
# Sanity-check the training split: shapes must match the declared sizes.
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
# count_nonzero returns the number of 1 labels, so the zeros are the remainder.
one_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (nbRatingsTrain - one_labels, one_labels))

# Same checks for the test split.
print(X_test.shape)
print(Y_test.shape)
assert X_test.shape == (nbRatingsTest, nbFeatures)
assert Y_test.shape == (nbRatingsTest, )
one_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (nbRatingsTest - one_labels, one_labels))

### Convert to protobuf and save to S3

In [None]:
# S3 bucket (replace with your own bucket name) and the key prefix under
# which every object of this notebook is stored.
bucket = 'hyun-data-kr'
prefix = 'sagemaker/fm-movielens'

# Object names of the serialized training and test sets.
train_key, test_key = 'train.protobuf', 'test.protobuf'

# Key prefixes of the training and test channels.
train_prefix, test_prefix = [
    '{}/{}'.format(prefix, channel) for channel in ('train3', 'test3')
]

# S3 URI under which the training job output is stored.
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

In [None]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize a sparse dataset in memory and upload it to S3.

    Args:
        X: Sparse feature matrix passed to
            ``smac.write_spmatrix_to_sparse_tensor``.
        Y: Labels passed alongside ``X``.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix of the uploaded object.
        key: Object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    objectKey = '{}/{}'.format(prefix, key)

    payload = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(payload, X, Y)
    # Rewind so the upload starts from the first serialized byte.
    payload.seek(0)

    target = boto3.resource('s3').Bucket(bucket).Object(objectKey)
    target.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, objectKey)
 
# Upload both splits and keep the S3 URIs for the training channels.
train_data = writeDatasetToProtobuf(
    X=X_train, Y=Y_train, bucket=bucket, prefix=train_prefix, key=train_key)
test_data = writeDatasetToProtobuf(
    X=X_test, Y=Y_test, bucket=bucket, prefix=test_prefix, key=test_key)

print(train_data, test_data, sep='\n')
print('Output: {}'.format(output_prefix))

### Run training job

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the Factorization Machines container image for the region of the
# current boto3 session.
awsRegion = boto3.Session().region_name
container = get_image_uri(awsRegion, 'factorization-machines')

In [None]:
# Estimator for the Factorization Machines container: one ml.c5.4xlarge
# training instance, with output_path set to output_prefix.
executionRole = get_execution_role()
session = sagemaker.Session()
fm = sagemaker.estimator.Estimator(
    container,
    executionRole,
    train_instance_count=1,
    train_instance_type='ml.c5.4xlarge',
    output_path=output_prefix,
    sagemaker_session=session,
)

# num_factors: the common dimension for the user and item matrices
hyperparameters = {
    'feature_dim': nbFeatures,
    'predictor_type': 'binary_classifier',
    'mini_batch_size': 1000,
    'num_factors': 64,
    'epochs': 100,
}
fm.set_hyperparameters(**hyperparameters)

# Launch the training job with the uploaded train and test channels.
channels = {'train': train_data, 'test': test_data}
fm.fit(channels)

### Deploy model

In [None]:
# Deploy the trained model to an endpoint backed by one ml.c4.xlarge instance.
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
)

In [None]:
def fm_serializer(data):
    """Serialize a batch of dense feature rows to a JSON request body.

    Args:
        data: Iterable of rows. Each row is a NumPy array or any array-like
            (list, tuple) of feature values.

    Returns:
        A JSON string of the form
        ``{"instances": [{"features": [...]}, ...]}`` with one entry per row,
        in input order.
    """
    # np.asarray accepts plain Python sequences as well as ndarray rows, and
    # tolist() turns NumPy scalars into JSON-serializable Python numbers.
    instances = [{'features': np.asarray(row).tolist()} for row in data]
    return json.dumps({'instances': instances})

# Exchange JSON with the endpoint: requests are built by fm_serializer and
# responses are parsed by json_deserializer.
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer
fm_predictor.content_type = 'application/json'

### Run predictions

In [None]:
# Send ten test samples (rows 1000 to 1009) to the endpoint and print the
# response followed by the corresponding labels.
sampleRows = slice(1000, 1010)
result = fm_predictor.predict(X_test[sampleRows].toarray())
print(result)
print(Y_test[sampleRows])

In [None]:
# Show the sparse features of the same ten test samples, then their labels.
print(X_test[1000:1010], Y_test[1000:1010], sep='\n')

In [None]:
# Delete the endpoint behind fm_predictor once the predictions above are done.
fm_predictor.delete_endpoint()