In [1]:
import sagemaker
import pandas as pd
from io import StringIO, BytesIO
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.serializers import CSVSerializer, IdentitySerializer
from sklearn.metrics import roc_auc_score

# Train RGCN model using SageMaker

Please make sure you have downloaded the data by running the [`01-Prepare-Data.ipynb`](01-Prepare-Data.ipynb) notebook first.

In [2]:
# Session handle used for the S3 upload and the endpoint clean-up later in this notebook.
sagemaker_session = sagemaker.Session()

# IAM execution role handed to both the estimator and the deployed model.
role = sagemaker.get_execution_role()

# S3 destination for the training data: the session's default bucket plus a fixed key prefix.
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/ieee-fraud-detection-train"

In [3]:
### Push the local training parquet file to S3 under <bucket>/<prefix>;
### the returned value is what `estimator.fit` receives as the "training" channel.
inputs = sagemaker_session.upload_data(
    path="./data/train.parquet",
    bucket=bucket,
    key_prefix=prefix,
)

In [4]:
### Build the SageMaker PyTorch estimator around the custom training script

# Hyperparameters handed to the estimator for the training job.
rgcn_hyperparameters = {
    "embedding_size": 64,
    "n_layers": 2,
    "n_epochs": 150,
    "n_hidden": 16,
    "dropout": 0.2,
    "weight_decay": 5e-05,
    "lr": 0.01,
}

estimator = PyTorch(
    # training code: entry script inside the local `fgnn` source directory
    entry_point="smtrain.py",
    source_dir="fgnn",
    # runtime / framework versions
    py_version="py38",
    framework_version="1.11.0",
    # infrastructure for the training job
    role=role,
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    volume_size=100,
    hyperparameters=rgcn_hyperparameters,
)

In [None]:
### Launch the training job, feeding the uploaded data as the "training" channel
estimator.fit(inputs={"training": inputs})

# Deploy trained RGCN model to SageMaker endpoint 

In [6]:
## Build a SageMaker model from the trained artifacts plus the same source code
model = PyTorchModel(
    model_data=estimator.model_data,  # artifacts produced by the fitted estimator
    entry_point="smtrain.py",
    source_dir="fgnn",
    py_version="py38",
    framework_version="1.11.0",
    role=role,
    model_server_workers=2,
)

## Alternative: derive the model directly from the fitted estimator object
# model = estimator.create_model(model_server_workers=2)

In [7]:
## Deploy the model to an endpoint that accepts a (serialized) parquet payload.
## The identity serializer forwards the bytes unchanged and tags them with the parquet content type.
parquet_serializer = IdentitySerializer(content_type="application/x-parquet")

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.4xlarge",
    serializer=parquet_serializer,
)

-------!

In [8]:
### alternatively, deploy sm model to an endpoint that will accept payload in CSV format
# predictor_csv = model.deploy(initial_instance_count=1, instance_type="ml.m5.4xlarge", 
# serializer=IdentitySerializer(content_type='text/csv'))

# Invoke endpoint with test transactions

In [9]:
### Read the test transactions from the local parquet file
df_test = pd.read_parquet(path="./data/test.parquet")

In [14]:
### sample a reproducible batch of 1000 transactions
### (a fixed seed keeps the batch, and therefore the ROC-AUC computed on it, identical across runs)
df_batch = df_test.sample(n=1000, random_state=42)

In [15]:
%%time
### serialize parquet table with test transactions
buffer = BytesIO()
df_batch.drop(columns=['isFraud']).to_parquet(buffer)
### invoke model endpoint with serialized parquet payload
response = predictor.predict(buffer.getvalue())

CPU times: user 67.5 ms, sys: 201 µs, total: 67.7 ms
Wall time: 7.98 s


In [16]:
### invoke model endpoint with CSV payload
### note that using CSV format may result in prediction errors because CSV serialization will lose column type information:
### e.g., when all rows for a string/object column have NaN values in a batch, this column will be deserialized as type float on the endpoint side,
### and one-hot-encoding of this column will fail inside fraud_detector.py (line `self._cat_transformer.transform(test_transactions)`)
#
# response = predictor_csv.predict(df_batch.to_csv(index=False))

In [17]:
### ROC-AUC of the endpoint's predictions against the true `isFraud` labels of the batch
roc_auc_score(y_true=df_batch["isFraud"], y_score=response)

0.8737550277724574

# Delete SageMaker endpoint

In [18]:
# Tear down the endpoint through the predictor rather than the session:
# `Predictor.delete_endpoint` also removes the associated endpoint configuration
# (delete_endpoint_config=True), which `Session.delete_endpoint` would leave behind.
predictor.delete_endpoint(delete_endpoint_config=True)
# predictor_csv.delete_endpoint(delete_endpoint_config=True)