{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Distributed TensorFlow training on Amazon SageMaker\n",
    "\n",
    "This notebook launches the same training script (`training_script.py`) as two SageMaker training jobs:\n",
    "\n",
    "1. **Multi-GPU on a single instance** using TensorFlow `MirroredStrategy`.\n",
    "2. **Multi-instance** using Horovod over MPI.\n",
    "\n",
    "Both jobs read their `train` and `validation` channels from the session's default S3 bucket under `distributed_training_demo/data`.\n",
    "The jobs are launched asynchronously (`wait=False`); the last cell blocks until the Horovod job finishes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "import boto3\n",
    "import sagemaker\n",
    "from sagemaker.tensorflow import TensorFlow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm_client = boto3.client('sagemaker')\n",
    "sm_session = sagemaker.Session()\n",
    "sm_role = sagemaker.get_execution_role()\n",
    "bucket = sm_session.default_bucket()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 key prefixes (inside the default bucket) for input data, TensorBoard logs and model artifacts\n",
    "data_prefix = 'distributed_training_demo/data'\n",
    "logs_prefix = 'distributed_training_demo/logs'\n",
    "model_prefix = 'distributed_training_demo/model'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## use file mode or pipe/tfdata\n",
    "# 0 -> pipe/tfdata (default), 1 -> file mode\n",
    "is_file = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## configure the training job\n",
    "\n",
    "# metrics to capture from output of training script\n",
    "metric_definitions = [\n",
    "    {'Name': 'train:loss', 'Regex': '- loss: ([0-9\\\\.]+)'},\n",
    "    {'Name': 'validation:loss', 'Regex': '- val_loss: ([0-9\\\\.]+)'},\n",
    "    {'Name': 'train:categorical_accuracy', 'Regex': '- categorical_accuracy: ([0-9\\\\.]+)'},\n",
    "    {'Name': 'validation:categorical_accuracy', 'Regex': '- val_categorical_accuracy: ([0-9\\\\.]+)'},\n",
    "    # the slash is escaped with a double backslash so the regex text is unchanged\n",
    "    # and Python does not warn about an invalid escape sequence\n",
    "    {'Name': 'epoch', 'Regex': 'Epoch ([0-9\\\\.]+)\\\\/[0-9\\\\.]+'},\n",
    "    {'Name': 'seconds_per_epoch', 'Regex': '- ([0-9]+)s -'}\n",
    "]\n",
    "\n",
    "if is_file:\n",
    "    is_pipe = 0\n",
    "    is_tfdata = 0\n",
    "    # defined in both branches so later cells never hit an undefined name\n",
    "    tfdata_s3uri = None\n",
    "else:\n",
    "    # tfdata wrapper for data input\n",
    "    # https://github.com/tensorflow/examples/blob/master/community/en/docs/deploy/s3.md\n",
    "    tfdata_s3uri = f's3://{bucket}/{data_prefix}'\n",
    "    is_pipe = 1\n",
    "    is_tfdata = 1\n",
    "\n",
    "# channel input mode follows the flag above instead of being hard-coded in each estimator;\n",
    "# with the default is_file = 0 this is 'Pipe'\n",
    "input_mode = 'Pipe' if is_pipe else 'File'\n",
    "\n",
    "train_path = f's3://{bucket}/{data_prefix}/train/'\n",
    "validation_path = f's3://{bucket}/{data_prefix}/validation/'\n",
    "data_inputs = {'train': train_path, 'validation': validation_path}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers\n",
    "\n",
    "Both training runs use the same script, framework version and metrics; only the job name, hyperparameters, cluster shape, output location and distribution config differ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tensorboard_logs_uri(job_name):\n",
    "    \"\"\"Return a minute-resolution timestamped S3 URI under `logs_prefix` for a job's TensorBoard logs.\"\"\"\n",
    "    timestamp = datetime.now().strftime('%Y%m%d-%H%M')\n",
    "    return f's3://{bucket}/{logs_prefix}/{job_name}-{timestamp}'\n",
    "\n",
    "\n",
    "def build_estimator(base_job_name, hyperparameters, instance_count, instance_type,\n",
    "                    output_path, distribution=None):\n",
    "    \"\"\"Create the SageMaker TensorFlow estimator shared by both training runs.\n",
    "\n",
    "    Reads `sm_role`, `metric_definitions` and `input_mode` from the notebook's\n",
    "    configuration cells.\n",
    "\n",
    "    Args:\n",
    "        base_job_name: prefix SageMaker uses for the training job name.\n",
    "        hyperparameters: dict passed to the training script.\n",
    "        instance_count: number of training instances.\n",
    "        instance_type: SageMaker instance type, e.g. 'ml.p3.16xlarge'.\n",
    "        output_path: S3 URI where model artifacts are written.\n",
    "        distribution: optional distribution config (e.g. MPI for Horovod);\n",
    "            None leaves the estimator default.\n",
    "\n",
    "    Returns:\n",
    "        A configured `sagemaker.tensorflow.TensorFlow` estimator (not yet fitted).\n",
    "    \"\"\"\n",
    "    # https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#create-an-estimator\n",
    "    return TensorFlow(\n",
    "        base_job_name=base_job_name,\n",
    "        entry_point='./training_script.py',\n",
    "        source_dir='../source_directory/training',\n",
    "        output_path=output_path,\n",
    "        role=sm_role,\n",
    "        framework_version='2.3',  # https://github.com/tensorflow/tensorflow/tags\n",
    "        py_version='py37',\n",
    "        volume_size=50,\n",
    "        metric_definitions=metric_definitions,\n",
    "        hyperparameters=hyperparameters,\n",
    "        input_mode=input_mode,\n",
    "        instance_count=instance_count,\n",
    "        instance_type=instance_type,\n",
    "        debugger_hook_config=False,  # turn off sm debugger (allows you to print tensors during training)\n",
    "        distribution=distribution,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-GPU training using TensorFlow MirroredStrategy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## set up sagemaker estimator object\n",
    "\n",
    "# hyperparameters\n",
    "base_job_name = 'cinic-demo-multi'\n",
    "tensorboard_logs_s3uri = tensorboard_logs_uri(base_job_name)\n",
    "output_path = f's3://{bucket}/{model_prefix}/'\n",
    "hyperparameters = {\n",
    "    'use-horovod': 0,\n",
    "    'tensorboard-logs-s3uri': tensorboard_logs_s3uri,\n",
    "    'learning-rate': 1e-4,\n",
    "    'batch-size': 1024,  # tf.distribute.MirroredStrategy() will divide this amongst GPUs\n",
    "    'epochs': 100,\n",
    "    'tfdata-s3uri': tfdata_s3uri if is_tfdata else None,\n",
    "}\n",
    "\n",
    "estimator = build_estimator(\n",
    "    base_job_name=base_job_name,\n",
    "    hyperparameters=hyperparameters,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.p3.16xlarge',\n",
    "    output_path=output_path,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## launch training job\n",
    "estimator.fit(data_inputs, wait=False, logs=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distributed training using Horovod"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## cluster shape and per-GPU hyperparameters\n",
    "# for horovod, need to set batch size per gpu, so the global batch is split across all workers\n",
    "horovod_instance_count = 2\n",
    "horovod_gpus_per_instance = 4  # number of GPUs per ml.p3.8xlarge instance\n",
    "horovod_world_size = horovod_instance_count * horovod_gpus_per_instance\n",
    "\n",
    "# build a new dict instead of mutating `hyperparameters` in place,\n",
    "# so re-running this cell does not keep shrinking the batch size\n",
    "horovod_hyperparameters = {\n",
    "    **hyperparameters,\n",
    "    'use-horovod': 1,\n",
    "    'batch-size': hyperparameters['batch-size'] // horovod_world_size,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## launching training job with mpirun distribution\n",
    "# https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-horovod\n",
    "distribution = {\n",
    "    'mpi': {\n",
    "        'enabled': True,\n",
    "        'processes_per_host': horovod_gpus_per_instance,  # one process per GPU\n",
    "        'custom_mpi_options': '--NCCL_DEBUG=INFO',\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "## set up horovod estimator\n",
    "\n",
    "# hyperparameters\n",
    "base_job_name = 'cinic-demo-horovod'\n",
    "tensorboard_logs_s3uri = tensorboard_logs_uri(base_job_name)\n",
    "output_path = f's3://{bucket}/{model_prefix}'\n",
    "horovod_hyperparameters['tensorboard-logs-s3uri'] = tensorboard_logs_s3uri\n",
    "\n",
    "# sagemaker estimator object\n",
    "estimator = build_estimator(\n",
    "    base_job_name=base_job_name,\n",
    "    hyperparameters=horovod_hyperparameters,\n",
    "    instance_count=horovod_instance_count,\n",
    "    instance_type='ml.p3.8xlarge',\n",
    "    output_path=output_path,\n",
    "    distribution=distribution,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training job name: cinic-demo-horovod-2021-02-12-20-05-02-420\n"
     ]
    }
   ],
   "source": [
    "## launch training job\n",
    "estimator.fit(data_inputs, wait=False, logs=None)\n",
    "print(\"Training job name:\", estimator.latest_training_job.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## block until the horovod job finishes (prints status transitions only, not the training logs)\n",
    "estimator.latest_training_job.wait(logs=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow2_p36",
   "language": "python",
   "name": "conda_tensorflow2_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}