# Building and Registering our ML Model

**SageMaker Studio Kernel**: Data Science

In this exercise you will:
 - Upload the dataset
 - Train your ML model using PyTorch
 - Compute the thresholds used by the application to classify the predictions as anomalies or normal behavior
 - Register the model in the model registry


### Upload the dataset

In [None]:
import sagemaker
import numpy as np
import glob
from sagemaker.pytorch.estimator import PyTorch

# Resolve the execution role, the session and its default bucket.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()

prefix = 'wind_turbine_anomaly'

# Sort the matches so the file order is deterministic (glob order is
# unspecified), and fail early with a clear message instead of an
# IndexError on data_files[0] after the upload loop.
data_files = sorted(glob.glob('data/*.npy'))
if not data_files:
    raise FileNotFoundError("No .npy files found under 'data/'")

# S3 location that the training job reads from.
train_input = "s3://%s/%s/data" % (bucket_name, prefix)

# Upload every local array file under <prefix>/data in the default bucket.
for f in data_files:
    sagemaker_session.upload_data(f, key_prefix="%s/data" % prefix)

# Axis 1 of the stored arrays is used as the number of features.
n_features = np.load(data_files[0]).shape[1]

print(train_input)

### Training script

In [None]:
%%writefile wind_turbine.py
import argparse
import glob
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from sklearn.model_selection import KFold

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def create_model(n_features, dropout=0):
    """Build the convolutional encoder/decoder network.

    Three ``Conv2d`` layers widen the channels (n_features -> 32 -> 64 -> 128)
    and three ``ConvTranspose2d`` layers mirror them back to ``n_features``.
    The paddings are chosen so the decoder undoes the spatial growth of the
    encoder, i.e. the output has the same height/width as the input.

    Args:
        n_features: number of input (and output) channels.
        dropout: dropout probability applied after the inner activations.

    Returns:
        A ``torch.nn.Sequential`` with the 15 layers in a fixed order.
    """
    # Layers are instantiated in the same order as they are applied.
    encoder = [
        nn.Conv2d(n_features, 32, kernel_size=2, padding=1),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Conv2d(32, 64, kernel_size=2, padding=1),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Conv2d(64, 128, kernel_size=2, padding=2),
        nn.ReLU(),
    ]
    decoder = [
        nn.ConvTranspose2d(128, 64, kernel_size=2, padding=2),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.ConvTranspose2d(64, 32, kernel_size=2, padding=1),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.ConvTranspose2d(32, n_features, kernel_size=2, padding=1),
    ]
    return nn.Sequential(*encoder, *decoder)

def load_data(data_dir):
    """Load every ``*.npy`` file in ``data_dir`` and stack them row-wise.

    Files are read in sorted name order so the resulting row order is
    deterministic (``glob`` returns matches in arbitrary order).

    Args:
        data_dir: directory containing the ``.npy`` files.

    Returns:
        A single ``numpy.ndarray`` produced by ``np.vstack`` of all files.

    Raises:
        ValueError: if ``data_dir`` contains no ``.npy`` file.
    """
    input_files = sorted(glob.glob(os.path.join(data_dir, '*.npy')))
    if not input_files:
        # np.vstack([]) would raise a ValueError that does not name the folder.
        raise ValueError("No .npy files found in %s" % data_dir)
    return np.vstack([np.load(path) for path in input_files])

def train_epoch(optimizer, criterion, epoch, model, train_dataloader, test_dataloader):
    """Run one training pass followed by one evaluation pass.

    Args:
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: callable ``criterion(output, target)`` returning the loss.
        epoch: index of the current epoch (accepted for the caller's
            convenience; not used by the computation).
        model: the network to train; left in ``eval`` mode on return.
        train_dataloader: iterable of ``(input, target)`` training batches.
        test_dataloader: iterable of ``(input, target)`` evaluation batches.

    Returns:
        ``(train_loss, test_loss)``: the per-batch losses summed over each
        iterable (sums, not averages).
    """
    train_loss = 0.0
    test_loss = 0.0

    model.train()
    for x_train, y_train in train_dataloader:
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()
        output_train = model(x_train)
        loss_train = criterion(output_train, y_train)
        # statistics
        train_loss += loss_train.item()
        # Back-propagate and update the model parameters.
        loss_train.backward()
        optimizer.step()

    model.eval()
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every test batch (less memory, faster).
    with torch.no_grad():
        for x_test, y_test in test_dataloader:
            output_test = model(x_test.float())
            loss_test = criterion(output_test, y_test)
            # statistics
            test_loss += loss_test.item()

    return train_loss, test_loss

def model_fn(model_dir):
    """Load the serialized model from ``model_dir`` and prepare it for inference.

    Args:
        model_dir: directory that contains ``model.pth``.

    Returns:
        The deserialized model, moved to ``device`` and switched to eval mode.
    """
    # map_location remaps stored tensors onto the device available here, so
    # an artifact saved on a GPU host can still be loaded on a CPU-only host.
    model = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
    model = model.to(device)
    model.eval()
    return model

def predict_fn(input_data, model):
    """Run a forward pass of ``model`` on ``input_data`` without gradients.

    Args:
        input_data: tensor to score; it is cast to float and moved to ``device``.
        model: the loaded network.

    Returns:
        The model output for ``input_data``.
    """
    batch = input_data.float().to(device)
    with torch.no_grad():
        prediction = model(batch)
    return prediction

def train(args):
    """Train with K-fold cross validation and export the best model.

    For every selected fold a fresh model is trained for ``args.num_epochs``
    epochs using the input itself as the target. Whenever an epoch reaches a
    new lowest test loss (across all folds) its weights are checkpointed to
    ``<output_data_dir>/model_state.pth``. At the end the best checkpoint is
    loaded into a fresh model, which is saved whole to
    ``<model_dir>/model.pth``.

    Args:
        args: parsed arguments providing ``train``, ``output_data_dir``,
            ``model_dir``, ``k_fold_splits``, ``k_index_only``, ``batch_size``,
            ``num_epochs``, ``num_features``, ``learning_rate`` and
            ``dropout_rate``.

    Raises:
        RuntimeError: if no checkpoint was produced (for example
            ``k_index_only`` selects a fold that does not exist, or
            ``num_epochs`` is 0).
    """
    # Start from +inf so the first finite test loss always becomes the best
    # (a start of 0 could never be beaten by a non-negative loss).
    best_loss = float("inf")
    best_fold = -1
    num_epochs = args.num_epochs
    batch_size = args.batch_size

    X = load_data(args.train)
    criterion = nn.MSELoss()
    kf = KFold(n_splits=args.k_fold_splits, shuffle=True)
    state_path = os.path.join(args.output_data_dir, 'model_state.pth')

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        # skip other Ks if a fixed fold was requested
        if args.k_index_only >= 0 and args.k_index_only != i:
            continue

        # Share of the whole dataset held out for testing in this fold.
        print("Test dataset proportion: %.02f%%" % (len(test_index) / len(X) * 100))
        X_train = torch.from_numpy(X[train_index]).float().to(device)
        X_test = torch.from_numpy(X[test_index]).float().to(device)

        # Input and target are the same tensor: the model learns to
        # reconstruct its input.
        train_dataset = torch.utils.data.TensorDataset(X_train, X_train)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
        test_dataset = torch.utils.data.TensorDataset(X_test, X_test)
        test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

        # Instantiate a fresh model and optimizer for this fold.
        model = create_model(args.num_features, args.dropout_rate)
        model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

        # Training loop
        for epoch in range(num_epochs):
            start_time = time.time()
            train_loss, test_loss = train_epoch(
                optimizer, criterion, epoch, model, train_dataloader, test_dataloader
            )
            elapsed_time = (time.time() - start_time)
            print("k=%d; epoch=%d; train_loss=%.3f; test_loss=%.3f; elapsed_time=%.3fs" % (i, epoch, train_loss, test_loss, elapsed_time))
            if test_loss < best_loss:
                # New overall best: checkpoint the weights and remember the fold.
                torch.save(model.state_dict(), state_path)
                best_loss = test_loss
                best_fold = i

    if best_fold < 0:
        # Nothing was checkpointed; fail here rather than on a missing file.
        raise RuntimeError(
            "No model checkpoint was saved "
            "(k_index_only=%d, k_fold_splits=%d, num_epochs=%d)"
            % (args.k_index_only, args.k_fold_splits, num_epochs)
        )

    print("\nBest model: best_mse=%f;" % best_loss)
    print("Best fold: k=%d" % best_fold)
    # Rebuild the architecture, restore the best weights and export the model.
    model = create_model(args.num_features, args.dropout_rate)
    model.load_state_dict(torch.load(state_path))
    torch.save(model, os.path.join(args.model_dir, "model.pth"))

if __name__ == '__main__':
    # Script entry point: parse the hyperparameters and locations, then train.
    parser = argparse.ArgumentParser()

    # Hyperparameters; each can be overridden from the command line.
    parser.add_argument('--k_fold_splits', type=int, default=6)
    # -1 trains every fold; a value >= 0 trains only the fold with that index.
    parser.add_argument('--k_index_only', type=int, default=-1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_epochs', type=int, default=10)
    parser.add_argument('--num_features', type=int, default=8)
    parser.add_argument('--learning_rate', type=float, default=0.003)
    parser.add_argument('--dropout_rate', type=float, default=0.0)

    # Sagemaker specific arguments. Defaults are read from the environment
    # variables, which must be set (a missing one raises KeyError here).
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    args = parser.parse_args()
    train(args)

## Training our model with SageMaker

In [None]:
# Use the built-in PyTorch container
estimator = PyTorch(
    'wind_turbine.py',
    framework_version='1.6.0',
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type='ml.p3.2xlarge',
    #instance_type='local_gpu',
    instance_count=1,
    py_version='py3',
    hyperparameters={
        'k_fold_splits': 6,
        'k_index_only': 3, # after running some experiments with this dataset, it makes sense to fix it
        'num_epochs': 200,
        'batch_size': 256,
        'learning_rate': 0.0001,
        'dropout_rate': 0.001,
        'num_features': n_features
    },
    # The patterns capture the values printed by the training script as
    # "train_loss=<value>;" and "test_loss=<value>;". Raw strings keep \S a
    # regex class instead of an invalid string escape.
    metric_definitions=[
        {'Name': 'train_loss:mse', 'Regex': r' train_loss=(\S+);'},
        {'Name': 'test_loss:mse', 'Regex': r' test_loss=(\S+);'}
    ]
)

In [None]:
# Launch the training job, feeding the uploaded dataset as the 'train' channel.
training_channels = {'train': train_input}
estimator.fit(inputs=training_channels)

## Compute the threshold based on MAE

In [None]:
# Batch transform job definition: predictions are written under <prefix>/output
# in the same bucket that holds the training data.
transform_output_path = "s3://%s/%s/output" % (bucket_name, prefix)

transformer = estimator.transformer(
    instance_type='ml.m5.xlarge',
    instance_count=1,
    output_path=transform_output_path,
    strategy='MultiRecord',
    assemble_with='Line',
    accept='application/x-npy',
    max_payload=20,
)

In [None]:
# Score the training data itself with a batch transform job ...
transformer.transform(data=train_input, content_type='application/x-npy')
# ... and block until that job has finished.
transformer.wait()

### Download the predictions

In [None]:
# Fetch the batch transform output into data/preds/. The key prefix is derived
# from `prefix` so it always matches the location the transformer wrote to.
sagemaker_session.download_data(bucket=bucket_name, key_prefix='%s/output/' % prefix, path='data/preds/')

### Compute MAE & the thresholds

In [None]:
import glob
import os

import numpy as np

# Pair every input file with its own prediction file instead of relying on two
# independent glob() calls returning matching orders (glob order is
# unspecified, which could silently misalign inputs and predictions).
# Batch transform names each output "<input file name>.out".
preds_dir = 'data/preds'
pred_files = [os.path.join(preds_dir, os.path.basename(f) + '.out') for f in data_files]

x_inputs = np.vstack([np.load(f) for f in data_files])
y_preds = np.vstack([np.load(f) for f in pred_files])

n_samples, n_features, n_rows, n_cols = x_inputs.shape

# (samples, features, rows, cols) -> (samples, rows*cols, features)
x_inputs = x_inputs.reshape(n_samples, n_features, n_rows*n_cols).transpose((0,2,1))
y_preds = y_preds.reshape(n_samples, n_features, n_rows*n_cols).transpose((0,2,1))

# Mean absolute error per sample and feature, laid out as (features, samples).
mae_loss = np.mean(np.abs(y_preds - x_inputs), axis=1).transpose((1,0))
mae_loss[np.isnan(mae_loss)] = 0

# One threshold per feature: the average MAE over all samples.
thresholds = np.mean(mae_loss, axis=1)
print(",".join(thresholds.astype(str)), thresholds.shape)

### Register the model in the registry

In [None]:
# Specify the model source
from sagemaker.pytorch.estimator import PyTorchModel

model_package_group_turbine="modelPackageGroupTurbine"

# Ask the estimator for the artifact location instead of rebuilding the S3
# path by hand (the manual concatenation only works while output_path ends
# with a slash).
model_url = estimator.model_data

# Hyperparameters recorded with the model; values mirror the training job.
env= {
    "batch_size": "256",
    "dropout_rate": "0.001",
    "learning_rate": "0.0001",
    "kernel_size": "2"
}

#https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#create-a-pytorchmodel-object
model = PyTorchModel(
    model_url,
    role,
    'wind_turbine.py',
    py_version='py3',
    env=env,
    sagemaker_session=sagemaker_session,
    framework_version='1.6.0',
    name=estimator.latest_training_job.name
)

# Register the model as a new version in the package group; it stays pending
# until someone approves it.
model_package = model.register(
    content_types=["*"],
    response_types=["application/json"],
    inference_instances=["ml.p3.2xlarge"],
    transform_instances=["ml.m5.xlarge"],
    description="Turbine anomaly detection",
    approval_status="PendingManualApproval",
    model_package_group_name=model_package_group_turbine
)

At this stage, our model is trained, and registered in the registry. The model is ready to be approved. Let's verify it was correctly pushed to the registry:

In [None]:
import boto3

# Low-level SageMaker client used to inspect the model registry.
boto3_sm = boto3.client(service_name='sagemaker')

# Show the metadata of the model package group the model was registered in.
boto3_sm.describe_model_package_group(
    ModelPackageGroupName=model_package_group_turbine,
)

In [None]:
# List the model package versions registered in the group.
boto3_sm.list_model_packages(ModelPackageGroupName=model_package_group_turbine)