{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "from datetime import datetime\n",
    "import sagemaker\n",
    "from sagemaker.tensorflow import TensorFlow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SageMaker handles used by the cells below\n",
    "sm_client = boto3.client('sagemaker')     # low-level SageMaker API client\n",
    "sm_session = sagemaker.Session()          # SDK session\n",
    "sm_role = sagemaker.get_execution_role()  # role handed to the estimators\n",
    "bucket = sm_session.default_bucket()      # bucket used for the data, logs and model S3 paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_prefix = 'distributed_training_demo/data'\n",
    "logs_prefix= 'distributed_training_demo/logs'\n",
    "model_prefix = 'distributed_training_demo/model'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## use file mode or pipe/tfdata\n",
    "is_file = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## configure the training job\n",
    "\n",
    "# Metrics to capture from the output of the training script.\n",
    "# Raw strings keep the regex backslashes literal, so Python no longer warns about\n",
    "# the invalid string escape in the 'epoch' pattern; the regex text itself is unchanged.\n",
    "metric_definitions = [\n",
    "    {'Name': 'train:loss', 'Regex': r'- loss: ([0-9\\.]+)'},\n",
    "    {'Name': 'validation:loss', 'Regex': r'- val_loss: ([0-9\\.]+)'},\n",
    "    {'Name': 'train:categorical_accuracy', 'Regex': r'- categorical_accuracy: ([0-9\\.]+)'},\n",
    "    {'Name': 'validation:categorical_accuracy', 'Regex': r'- val_categorical_accuracy: ([0-9\\.]+)'},\n",
    "    {'Name': 'epoch', 'Regex': r'Epoch ([0-9\\.]+)\\/[0-9\\.]+'},\n",
    "    {'Name': 'seconds_per_epoch', 'Regex': r'- ([0-9]+)s -'}\n",
    "]\n",
    "\n",
    "# Derive the input flags from the is_file toggle.\n",
    "if is_file:\n",
    "    is_pipe = 0\n",
    "    is_tfdata = 0\n",
    "    tfdata_s3uri = None  # defined in both branches so later cells can always read the name\n",
    "else:\n",
    "    # tfdata wrapper for data input\n",
    "    # https://github.com/tensorflow/examples/blob/master/community/en/docs/deploy/s3.md\n",
    "    tfdata_s3uri = f's3://{bucket}/{data_prefix}'\n",
    "    is_pipe = 1\n",
    "    is_tfdata = 1\n",
    "\n",
    "# S3 locations of the two input channels handed to estimator.fit()\n",
    "train_path = f's3://{bucket}/{data_prefix}/train/'\n",
    "validation_path = f's3://{bucket}/{data_prefix}/validation/'\n",
    "data_inputs = {'train': train_path, 'validation': validation_path}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-GPU training using TensorFlow MirroredStrategy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## set up sagemaker estimator object (single instance, MirroredStrategy)\n",
    "\n",
    "# hyperparameters forwarded to the training script\n",
    "base_job_name = 'cinic-demo-multi'\n",
    "tensorboard_logs_s3uri = f's3://{bucket}/{logs_prefix}/{base_job_name}-{datetime.now().strftime(\"%Y%m%d-%H%M\")}'\n",
    "output_path = f's3://{bucket}/{model_prefix}/'\n",
    "hyperparameters = {\n",
    "    'use-horovod': 0,\n",
    "    'tensorboard-logs-s3uri': tensorboard_logs_s3uri,\n",
    "    'learning-rate': 1e-4,\n",
    "    'batch-size': 1024, # tf.distribute.MirroredStrategy() will divide this amongst GPUs\n",
    "    'epochs': 100,\n",
    "    'tfdata-s3uri': tfdata_s3uri if is_tfdata else None,\n",
    "}\n",
    "\n",
    "# https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#create-an-estimator\n",
    "estimator = TensorFlow(\n",
    "    base_job_name=base_job_name,\n",
    "    entry_point='./training_script.py',\n",
    "    source_dir='../source_directory/training',\n",
    "    output_path=output_path,\n",
    "    role=sm_role,\n",
    "    framework_version='2.3', # https://github.com/tensorflow/tensorflow/tags\n",
    "    py_version='py37',\n",
    "    volume_size=50,\n",
    "    metric_definitions=metric_definitions,\n",
    "    hyperparameters=hyperparameters,\n",
    "    input_mode='Pipe' if is_pipe else 'File', # follow the is_file toggle instead of always using Pipe\n",
    "    instance_count=1,\n",
    "    instance_type='ml.p3.16xlarge',\n",
    "    debugger_hook_config=False, # turn off sm debugger (allows you to print tensors during training)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## launch training job without blocking the notebook (wait=False)\n",
    "estimator.fit(inputs=data_inputs, wait=False, logs=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distributed training using Horovod"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## update batch size and hyperparameters\n",
    "# for horovod, the batch size must be set per GPU:\n",
    "# 8 = 2 instances x 4 GPUs per instance, as configured in the cells below.\n",
    "# The guard keeps this cell idempotent: re-running it must not divide the batch size again.\n",
    "if not hyperparameters['use-horovod']:\n",
    "    hyperparameters['batch-size'] = hyperparameters['batch-size'] // 8\n",
    "    hyperparameters['use-horovod'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## launching training job with mpirun distribution\n",
    "# https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-horovod\n",
    "distribution = {\n",
    "    'mpi': {\n",
    "        'enabled': True,\n",
    "        'processes_per_host': 4, # number of GPUs per instance\n",
    "        'custom_mpi_options': '--NCCL_DEBUG=INFO',\n",
    "        }\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "## set up horovod estimator (two instances, MPI distribution)\n",
    "\n",
    "# hyperparameters: reuse the dict from above, pointing TensorBoard logs at a fresh location\n",
    "base_job_name = 'cinic-demo-horovod'\n",
    "tensorboard_logs_s3uri = f's3://{bucket}/{logs_prefix}/{base_job_name}-{datetime.now().strftime(\"%Y%m%d-%H%M\")}'\n",
    "output_path = f's3://{bucket}/{model_prefix}'\n",
    "hyperparameters['tensorboard-logs-s3uri'] = tensorboard_logs_s3uri\n",
    "\n",
    "# sagemaker estimator object\n",
    "estimator = TensorFlow(\n",
    "    base_job_name=base_job_name,\n",
    "    entry_point='./training_script.py',\n",
    "    source_dir='../source_directory/training',\n",
    "    output_path=output_path,\n",
    "    role=sm_role,\n",
    "    framework_version='2.3', # https://github.com/tensorflow/tensorflow/tags\n",
    "    py_version='py37',\n",
    "    volume_size=50,\n",
    "    metric_definitions=metric_definitions,\n",
    "    hyperparameters=hyperparameters,\n",
    "    input_mode='Pipe' if is_pipe else 'File', # follow the is_file toggle instead of always using Pipe\n",
    "    instance_count=2,\n",
    "    instance_type='ml.p3.8xlarge',\n",
    "    debugger_hook_config=False, # turn off sm debugger (allows you to print tensors during training)\n",
    "    distribution=distribution,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training job name: cinic-demo-horovod-2021-02-12-20-05-02-420\n"
     ]
    }
   ],
   "source": [
    "## launch training job without blocking, then report its name\n",
    "estimator.fit(inputs=data_inputs, wait=False, logs=None)\n",
    "horovod_job = estimator.latest_training_job\n",
    "print(f\"Training job name: {horovod_job.name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "2021-02-12 20:05:09 Starting - Launching requested ML instances.....\n",
      "2021-02-12 20:07:10 Starting - Preparing the instances for training........\n",
      "2021-02-12 20:07:55 Downloading - Downloading input data.\n",
      "2021-02-12 20:08:10 Training - Downloading the training image...............\n",
      "2021-02-12 20:09:27 Training - Training image download completed. Training in progress...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................\n",
      "2021-02-12 21:07:04 Uploading - Uploading generated training model.....\n",
      "2021-02-12 21:07:37 Completed - Training job completed\n"
     ]
    }
   ],
   "source": [
    "# block until the most recently launched training job finishes (logs=False)\n",
    "latest_job = estimator.latest_training_job\n",
    "latest_job.wait(logs=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow2_p36",
   "language": "python",
   "name": "conda_tensorflow2_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}