{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Horovod Distributed Training with Script Mode."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.tensorflow import TensorFlow\n",
    "from sagemaker.utils import sagemaker_timestamp\n",
    "\n",
    "sage_session = sagemaker.Session()\n",
    "role = get_execution_role()\n",
    "\n",
    "# Account id and region of the current session; both are needed to build the ECR image URL.\n",
    "account = sage_session.boto_session.client('sts').get_caller_identity()['Account']\n",
    "region = sage_session.boto_session.region_name\n",
    "\n",
    "image_name = \"sagemaker-horovod-distributed-training-3\"\n",
    "ecr_image_url = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, image_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build your horovod container"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%script env region=$region image_name=$image_name bash\n",
    "\n",
    "echo \"Building docker image locally with image name: $image_name region: $region\"\n",
    "\n",
    "# Sagemaker Registry account id\n",
    "sagemaker_registry_account=520713654638\n",
    "\n",
    "# Get the login command from ECR and execute it directly with the registry id of SageMaker,\n",
    "# so that the SageMaker TF container can be extended.\n",
    "# NOTE: `aws ecr get-login` is only available in AWS CLI v1. With AWS CLI v2 use\n",
    "# `aws ecr get-login-password | docker login --username AWS --password-stdin <registry>` instead.\n",
    "$(aws ecr get-login --region ${region} --no-include-email --registry-ids ${sagemaker_registry_account})\n",
    "\n",
    "# Build the docker image locally with the image name (it is pushed to ECR in a later cell).\n",
    "\n",
    "# On a SageMaker Notebook Instance, the docker daemon may need to be restarted in order\n",
    "# to detect your network configuration correctly. (This is a known issue.)\n",
    "if [ -d \"/home/ec2-user/SageMaker\" ]; then\n",
    "    sudo service docker restart\n",
    "fi\n",
    "\n",
    "cd ../ && docker build -t ${image_name}:latest --build-arg region=${region} -f docker/Dockerfile.cpu ."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Push container to ECR Repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%script env account=$account region=$region image_name=$image_name ecr_image_url=$ecr_image_url bash\n",
    "\n",
    "echo \"Pushing locally built container to ECR Repository: $ecr_image_url in region: $region on account: $account\"\n",
    "\n",
    "# If the repository doesn't exist in ECR, create it.\n",
    "# The region is passed explicitly so the repository is looked up and created in the same\n",
    "# region that the image URL points to, not in the CLI's default region.\n",
    "if ! aws ecr describe-repositories --region \"${region}\" --repository-names \"${image_name}\" > /dev/null 2>&1\n",
    "then\n",
    "    echo \"Creating a new ECR repository with name: $image_name\"\n",
    "    aws ecr create-repository --region \"${region}\" --repository-name \"${image_name}\" > /dev/null\n",
    "fi\n",
    "\n",
    "# Log docker in to the ECR registry of this account.\n",
    "# NOTE: `aws ecr get-login` is only available in AWS CLI v1.\n",
    "$(aws ecr get-login --no-include-email --region \"${region}\")\n",
    "\n",
    "# Tag Docker image with ECR Url\n",
    "docker tag \"${image_name}:latest\" \"${ecr_image_url}\"\n",
    "\n",
    "docker push \"${ecr_image_url}\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare train and test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_train_test_data(data_path, sagemaker_session):\n",
    "    \"\"\"Upload the local train and test data to S3 under a timestamped prefix.\n",
    "\n",
    "    Args:\n",
    "        data_path: Local directory containing the `train` and `test` sub-directories.\n",
    "        sagemaker_session: Session object whose `upload_data` method performs the uploads.\n",
    "\n",
    "    Returns:\n",
    "        A `(test_input, train_input)` tuple of the values returned by `upload_data`.\n",
    "        Note the order: the test input comes first.\n",
    "    \"\"\"\n",
    "    prefix = 'tf_mnist/{}'.format(sagemaker_timestamp())\n",
    "\n",
    "    train_input = sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),\n",
    "                                                key_prefix=prefix + '/train')\n",
    "    test_input = sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),\n",
    "                                               key_prefix=prefix + '/test')\n",
    "\n",
    "    return test_input, train_input\n",
    "\n",
    "\n",
    "source_dir = '../src'\n",
    "data_path = os.path.join(source_dir, 'data')\n",
    "\n",
    "test_input, train_input = _get_train_test_data(data_path, sage_session)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train it with Horovod\n",
    "\n",
    "Hyperparameters to control horovod behaviour:\n",
    "* `horovod-train-script`: Distributed training script using horovod.\n",
    "* `instance_count`: Number of instances to be used for horovod distributed training (passed to the estimator as `train_instance_count`).\n",
    "* `num-processes-per-host`: Number of processes per host to be launched as part of MPI/horovod job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(instance_count,\n",
    "          num_of_processes_per_host,\n",
    "          horovod_train_script):\n",
    "    \"\"\"Configure a TensorFlow estimator on the custom image and fit it on the uploaded data.\n",
    "\n",
    "    Reads the notebook-level variables `role`, `sage_session`, `ecr_image_url`,\n",
    "    `source_dir`, `train_input` and `test_input` defined in the cells above.\n",
    "\n",
    "    Args:\n",
    "        instance_count: Value passed as `train_instance_count`; also embedded in the job name.\n",
    "        num_of_processes_per_host: Value of the `num-processes-per-host` hyperparameter.\n",
    "        horovod_train_script: Value of the `horovod-train-script` hyperparameter.\n",
    "    \"\"\"\n",
    "    estimator = TensorFlow(\n",
    "        entry_point=\"horovod_launcher.py\",\n",
    "        role=role,\n",
    "        training_steps=1,\n",
    "        evaluation_steps=1,\n",
    "        train_instance_count=instance_count,\n",
    "        train_instance_type=\"ml.c4.xlarge\",\n",
    "        sagemaker_session=sage_session,\n",
    "        image_name=ecr_image_url,\n",
    "        base_job_name=\"tf-horovod-{}x\".format(instance_count),\n",
    "        source_dir=source_dir,\n",
    "        hyperparameters={\n",
    "            \"horovod-train-script\": horovod_train_script,\n",
    "            \"num-processes-per-host\": num_of_processes_per_host,\n",
    "        },\n",
    "    )\n",
    "\n",
    "    estimator.fit({'train': train_input, 'test': test_input})\n",
    "\n",
    "\n",
    "train(horovod_train_script=\"train_mnist_hvd.py\",\n",
    "      instance_count=2,\n",
    "      num_of_processes_per_host=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}