This notebook develops a model to predict whether a certain promotional offer will motivate buying behavior on an e-commerce site.
In this notebook, we take a small sample of the raw input, transform it, and train a higher-order factorization machine (FM) model directly using the tffm library.
The data is the public Kasandr data set from the UCI Machine Learning Repository. Per its attribution policy, we cite the following paper:
Sumit Sidana, Charlotte Laclau, Massih-Reza Amini, Gilles Vandelle, and Andre Bois-Crettez. 'KASANDR: A Large-Scale Dataset with Implicit Feedback for Recommendation', SIGIR 2017.
The data set is already divided into train and test sets. The train set has 15,844,718 samples and the test set has 1,919,562 samples, for a total of 17,764,280 samples, a roughly 90/10 train/test split.
The schema is: userid, offerid, countrycode, category, merchant, a UTC timestamp (stored as two whitespace-separated fields in the raw file, which we combine into a single utcdate column on load), and rating, the implicit-feedback label indicating whether the offer motivated buying behavior.
The country code is not useful, as the entire data set is for Germany.
# Imports
import boto3
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker.predictor import json_deserializer
from sagemaker import get_execution_role
import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import scipy
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from tffm import TFFMRegressor
import tensorflow as tf
import os.path
import csv, io, json
# Bucket with raw data
bucket = 'promo-dataprep'
prefix = 'data'
train_file = 'train_de.csv'
test_file = 'test_de.csv'
# set up boto for s3 access
role = get_execution_role()
s3 = boto3.resource('s3')
# download inputs
if os.path.isfile(train_file):
    print("{0} already exists, skipping".format(train_file))
else:
    s3.Bucket(bucket).download_file("{0}/{1}".format(prefix, train_file), train_file)
if os.path.isfile(test_file):
    print("{0} already exists, skipping".format(test_file))
else:
    s3.Bucket(bucket).download_file("{0}/{1}".format(prefix, test_file), test_file)
# read train data set
train_df = pd.read_csv(train_file,
                       parse_dates={'utcdate': [5, 6]}, infer_datetime_format=True,
                       skiprows=1, sep=r'\s+',
                       header=None,
                       names=['userid', 'offerid', 'countrycode', 'category', 'merchant', 'date0', 'date1', 'rating'],
                       dtype={'userid': 'category', 'offerid': 'category', 'countrycode': 'category',
                              'category': 'category', 'merchant': 'category',
                              'date0': 'str', 'date1': 'str', 'rating': 'int64'})
print("Read train data")
# read test data set
test_df = pd.read_csv(test_file,
                      parse_dates={'utcdate': [5, 6]}, infer_datetime_format=True,
                      skiprows=1, sep=r'\s+',
                      header=None,
                      names=['userid', 'offerid', 'countrycode', 'category', 'merchant', 'date0', 'date1', 'rating'],
                      dtype={'userid': 'category', 'offerid': 'category', 'countrycode': 'category',
                             'category': 'category', 'merchant': 'category',
                             'date0': 'str', 'date1': 'str', 'rating': 'int64'})
print("Read test data")
# drop timestamp
train_df_for_model = train_df.drop('utcdate', axis=1)
test_df_for_model = test_df.drop('utcdate', axis=1)
# rename label
train_df_for_model = train_df_for_model.rename(columns={"rating": "label"})
test_df_for_model = test_df_for_model.rename(columns={"rating": "label"})
# rename merchant to product
train_df_for_model = train_df_for_model.rename(columns={"merchant": "product"})
test_df_for_model = test_df_for_model.rename(columns={"merchant": "product"})
train_df_for_model.dtypes
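Before transforming the data, a couple of quick sanity checks confirm the row counts quoted earlier and the claim that the country code carries no information (the whole data set is for Germany). The expected counts in the comments assume the full Kasandr files from the bucket above.
# Sanity checks (expected counts assume the full Kasandr train/test files)
print("Train rows: {0}".format(train_df.shape[0]))   # expected 15,844,718
print("Test rows: {0}".format(test_df.shape[0]))     # expected 1,919,562
# The country code should take a single value, which is why it adds no signal
print("Distinct country codes: {0}".format(train_df.countrycode.nunique()))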
# encode data - merge first so the encoding is done consistently
merged_data = pd.concat([train_df_for_model, test_df_for_model], ignore_index=True)
print("Size of merged data: {0}".format(merged_data.shape))
# take a random sample of 50,000 rows to keep memory usage manageable
transformed_data = pd.get_dummies(merged_data.sample(50000))
print("Size of transformed data: {0}".format(transformed_data.shape))
y = transformed_data['label'].astype('float32').to_numpy()
# drop the label - we store it separately as the output vector
transformed_data.drop(['label'], axis=1, inplace=True)
X = np.array(transformed_data).astype('float32')
X = np.nan_to_num(X)
# Split data into train, test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
print("Size of training data: {0} ({1})".format(X_tr.shape, y_tr.shape))
print("Size of test data: {0} ({1})".format(X_te.shape, y_te.shape))
# convert to sparse matrix
X_tr_sparse = scipy.sparse.csr_matrix(X_tr)
X_te_sparse = scipy.sparse.csr_matrix(X_te)
print("Size of sparse training data: {0} ({1})".format(X_tr_sparse.shape, y_te.shape))
print("Size of sparse test data: {0} ({1})".format(X_te_sparse.shape, y_te.shape))
# convert to protobuf and save to s3
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
train_data = writeDatasetToProtobuf(X_tr_sparse, y_tr, bucket, 'train-small', 'train.protobuf')
test_data = writeDatasetToProtobuf(X_te_sparse, y_te, bucket, 'train-small', 'test.protobuf')
print("S3 training data: {0}".format(train_data))
print("S3 test data: {0}".format(test_data))
model = TFFMRegressor(
    order=3,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=50,
    batch_size=-1,
    init_std=0.001,
    input_type='sparse'
)
model.fit(X_tr_sparse, y_tr, show_progress=True)
predictions = model.predict(X_te_sparse)
model.destroy()
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, average_precision_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score, precision_recall_fscore_support, recall_score
predvec = np.where(predictions > 0.5, 1, 0)
print('Weighted F1: {}'.format(f1_score(y_te, predvec,average='weighted')))
print('Accuracy: {}'.format(accuracy_score(y_te, predvec)))
# ROC AUC is computed from the raw scores rather than the thresholded predictions
print('Weighted ROC AUC: {}'.format(roc_auc_score(y_te, predictions, average='weighted')))
print('Classification report: {}'.format(classification_report(y_te, predvec)))
print(pd.crosstab(np.array(y_te), predvec, rownames=['actuals'], colnames=['predictions']))
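Because the factorization machine emits continuous scores, it can also be useful to examine precision and recall across thresholds rather than only at the 0.5 cutoff. The check below is optional and uses only functions already imported above.
# Optional: threshold-free view of the classifier using the raw scores
precision, recall, thresholds = precision_recall_curve(y_te, predictions)
print('Average precision: {}'.format(average_precision_score(y_te, predictions)))
print('Number of distinct thresholds: {}'.format(len(thresholds)))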
headers = list(transformed_data.columns.values)
with open('headers-small.csv', 'w') as csvfile:
    hwriter = csv.writer(csvfile, delimiter=' ',
                         quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for h in headers:
        hwriter.writerow([h])
boto3.resource('s3').Bucket(bucket).Object('train-small/headers.csv').upload_file('headers-small.csv')
unique_offers = train_df.offerid.unique()
unique_offers.shape
Let's look at the distribution of the offers. How many of the offer codes are frequently used? When we look at the 'max' value below, note that it is substantially higher than the 75th or even the 99th percentile.
unique_counts = train_df.offerid.value_counts()
unique_counts.describe()
unique_counts.quantile(0.99)
The skew is large and positive, indicating a distribution with a long right tail. The kurtosis is also quite large, indicating a sharp peak with heavy tails.
unique_counts.skew()
unique_counts.kurtosis()
unique_counts.head(20)
We conclude that the offer ID counts are concentrated in a relatively small number of offer IDs, producing a distribution that is very heavy near the origin with a long tail. (Keep in mind that the offer counts are sorted in descending order.)
Let's see what a sample of 50 of the most common offers looks like.
# Generate the cumulative total of the first 'N' elements
cumulative_unique_counts = unique_counts.head(50).cumsum()
# Divide the cumulative totals by the total sample count
pct_of_total = cumulative_unique_counts.div(unique_counts.sum())
# The maximum (last) value shows how much of the total count we've incorporated with only 50 offer IDs.
max(pct_of_total)
OK: 50 offers (about 0.0025% of all unique offers) account for over 8% of the data set. Let's visualize the 200 most frequent offers and double-check the distribution.
import matplotlib.pyplot as plt
interesting_counts = unique_counts.head(200)
interesting_counts.plot(xticks=range(0, interesting_counts.shape[0], 10), rot=90, use_index=False,
                        title='Top 200 most frequent offers')
Indeed, there is a sharp dropoff somewhere around index 30-50.
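To put a rough number on that dropoff, the cell below (a small, optional check) computes what share of all training rows the top N offers cover at a few illustrative cutoffs; the exact percentages depend on the training file.
# Optional: cumulative share of training rows covered by the top N offer IDs.
# The cutoffs below are illustrative; exact values depend on the training file.
total = unique_counts.sum()
for n in [30, 50, 200, 1000]:
    print("Top {0} offers cover {1:.2%} of training rows".format(
        n, unique_counts.head(n).sum() / float(total)))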