In [1]:
import boto3
from datetime import datetime
import sagemaker
from sagemaker.tensorflow import TensorFlow

In [2]:
# Low-level boto3 client for direct SageMaker API calls.
sm_client = boto3.client(service_name='sagemaker')

# SageMaker SDK session; its default bucket is used for this demo's data, logs, and model output.
sm_session = sagemaker.Session()
bucket = sm_session.default_bucket()

# Role handed to the estimators below via `role=`.
sm_role = sagemaker.get_execution_role()

In [3]:
# S3 key prefixes (relative to the bucket) for input data, TensorBoard logs,
# and model artifacts; all three live under the same demo folder.
data_prefix, logs_prefix, model_prefix = (
    f'distributed_training_demo/{leaf}' for leaf in ('data', 'logs', 'model')
)

In [4]:
## use file mode or pipe/tfdata
# Truthy selects file mode; falsy (0) selects the pipe/tfdata branch in the
# training-job configuration cell, which sets is_pipe and is_tfdata from this flag.
is_file = 0

In [5]:
## configure the training job

# Metrics to capture from the output of the training script.
# Raw strings are used so regex escapes such as `\.` and `\/` are passed through
# verbatim instead of relying on Python tolerating unknown string escapes
# (which emits a warning on current interpreters).
metric_definitions = [
    {'Name': 'train:loss', 'Regex': r'- loss: ([0-9\.]+)'},
    {'Name': 'validation:loss', 'Regex': r'- val_loss: ([0-9\.]+)'},
    {'Name': 'train:categorical_accuracy', 'Regex': r'- categorical_accuracy: ([0-9\.]+)'},
    {'Name': 'validation:categorical_accuracy', 'Regex': r'- val_categorical_accuracy: ([0-9\.]+)'},
    {'Name': 'epoch', 'Regex': r'Epoch ([0-9\.]+)\/[0-9\.]+'},
    {'Name': 'seconds_per_epoch', 'Regex': r'- ([0-9]+)s -'}
]

# Derive the input-mode flags from `is_file` (set in the configuration cell).
if is_file:
    is_pipe = 0
    is_tfdata = 0
    # Bound in both branches so later cells can always reference the name.
    tfdata_s3uri = None
else:
    # tfdata wrapper for data input
    # https://github.com/tensorflow/examples/blob/master/community/en/docs/deploy/s3.md
    tfdata_s3uri = f's3://{bucket}/{data_prefix}'
    is_pipe = 1
    is_tfdata = 1

# S3 locations of the two input channels handed to `estimator.fit`.
train_path = f's3://{bucket}/{data_prefix}/train/'
validation_path = f's3://{bucket}/{data_prefix}/validation/'
data_inputs = {'train': train_path, 'validation': validation_path}

## Multi-GPU training using TensorFlow MirroredStrategy

In [6]:
## set up sagemaker estimator object

# hyperparameters
base_job_name = 'cinic-demo-multi'
# The minute-resolution timestamp gives each run of this cell its own TensorBoard log prefix.
tensorboard_logs_s3uri = f's3://{bucket}/{logs_prefix}/{base_job_name}-{datetime.now().strftime("%Y%m%d-%H%M")}'
output_path = f's3://{bucket}/{model_prefix}/'
hyperparameters = {
    'use-horovod': 0,
    'tensorboard-logs-s3uri': tensorboard_logs_s3uri,
    'learning-rate': 1e-4,
    'batch-size': 1024,  # tf.distribute.MirroredStrategy() will divide this amongst GPUs
    'epochs': 100,
    'tfdata-s3uri': tfdata_s3uri if is_tfdata else None,
}

# https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#create-an-estimator
estimator = TensorFlow(
    base_job_name=base_job_name,
    entry_point='./training_script.py',
    source_dir='../source_directory/training',
    output_path=output_path,
    role=sm_role,
    framework_version='2.3',  # https://github.com/tensorflow/tensorflow/tags
    py_version='py37',
    volume_size=50,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    # Follow the file/pipe switch computed in the configuration cell rather than
    # always requesting Pipe mode, so that setting is_file actually takes effect.
    input_mode='Pipe' if is_pipe else 'File',
    instance_count=1,
    instance_type='ml.p3.16xlarge',
    debugger_hook_config=False,  # turn off sm debugger (allows you to print tensors during training)
)

In [7]:
## submit the training job; wait=False returns control to the notebook without blocking
estimator.fit(inputs=data_inputs, wait=False, logs=None)

## Distributed training using Horovod

In [8]:
## update batch size and hyperparameters
# for horovod and herring below, need to set batch size per gpu
# Guarded by the 'use-horovod' flag so that re-running this cell does not
# divide the batch size a second time.
if not hyperparameters['use-horovod']:
    # 8 = total Horovod workers configured below (2 instances x 4 processes per host)
    hyperparameters['batch-size'] = int(hyperparameters['batch-size'] / 8)
    hyperparameters['use-horovod'] = 1

In [9]:
## MPI (mpirun) distribution settings for the Horovod training job
# https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-horovod
# NOTE(review): mpirun normally exports environment variables with `-x VAR=value`;
# confirm that the `--NCCL_DEBUG=INFO` form below is accepted as intended.
distribution = dict(
    mpi=dict(
        enabled=True,
        processes_per_host=4,  # number of GPUs per instance
        custom_mpi_options='--NCCL_DEBUG=INFO',
    ),
)

In [10]:
## set up horovod estimator

# hyperparameters
base_job_name = 'cinic-demo-horovod'
# The minute-resolution timestamp gives each run of this cell its own TensorBoard log prefix.
tensorboard_logs_s3uri = f's3://{bucket}/{logs_prefix}/{base_job_name}-{datetime.now().strftime("%Y%m%d-%H%M")}'
output_path = f's3://{bucket}/{model_prefix}'
hyperparameters['tensorboard-logs-s3uri'] = tensorboard_logs_s3uri

# sagemaker estimator object
estimator = TensorFlow(
    base_job_name=base_job_name,
    entry_point='./training_script.py',
    source_dir='../source_directory/training',
    output_path=output_path,
    role=sm_role,
    framework_version='2.3',  # https://github.com/tensorflow/tensorflow/tags
    py_version='py37',
    volume_size=50,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    # Follow the file/pipe switch computed in the configuration cell rather than
    # always requesting Pipe mode, so that setting is_file actually takes effect.
    input_mode='Pipe' if is_pipe else 'File',
    instance_count=2,
    instance_type='ml.p3.8xlarge',
    debugger_hook_config=False,  # turn off sm debugger (allows you to print tensors during training)
    distribution=distribution,
)

In [11]:
## submit the Horovod job without blocking, then report the name it was given
estimator.fit(inputs=data_inputs, wait=False, logs=None)
print("Training job name:", estimator.latest_training_job.name)

Training job name: cinic-demo-horovod-2021-02-12-20-05-02-420


In [14]:
# Block until the most recently launched training job finishes.
# NOTE(review): logs=False presumably suppresses streaming of the container logs
# (the saved output shows only status lines) — confirm against the SDK docs.
estimator.latest_training_job.wait(logs=False)


2021-02-12 20:05:09 Starting - Launching requested ML instances.....
2021-02-12 20:07:10 Starting - Preparing the instances for training........
2021-02-12 20:07:55 Downloading - Downloading input data.
2021-02-12 20:08:10 Training - Downloading the training image...............
2021-02-12 20:09:27 Training - Training image download completed. Training in progress.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................