## Horovod Distributed Training with Script Mode.

### Setup 

In [None]:
import sagemaker
import os
from sagemaker.utils import sagemaker_timestamp
from sagemaker.tensorflow import TensorFlow

from sagemaker import get_execution_role

# SageMaker session and execution role shared by every later cell.
sage_session = sagemaker.Session()
role = get_execution_role()

# The AWS account id and region come from the boto session behind the
# SageMaker session; together they address this account's ECR registry.
_boto_session = sage_session.boto_session
region = _boto_session.region_name
account = _boto_session.client('sts').get_caller_identity()['Account']

# Name of the image built below and its fully qualified ECR URL (":latest" tag).
image_name = "sagemaker-horovod-distributed-training-3"
ecr_image_url = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(
    account, region, image_name)


### Build your horovod container

In [None]:
%%script env region=$region image_name=$image_name bash

# Build the Horovod training image locally, extending the SageMaker TensorFlow container.
echo "Building docker image locally with image name: $image_name region: $region"

# Account id of the SageMaker registry.
sagemaker_registry_account=520713654638

# Get the login command from ECR and execute it directly, using the registry id of
# SageMaker, so that the SageMaker TF container can be extended.
# NOTE(review): `aws ecr get-login` is not available in AWS CLI v2 -- confirm the CLI version in use.
$(aws ecr get-login --region ${region} --no-include-email --registry-ids ${sagemaker_registry_account})

# On a SageMaker Notebook Instance, the docker daemon may need to be restarted in order
# to detect your network configuration correctly. (This is a known issue.)
if [[ -d "/home/ec2-user/SageMaker" ]]
then
    sudo service docker restart
fi

# Build the docker image locally under the given image name, using the parent
# directory as the build context. Pushing to ECR happens in a later cell.
cd .. && docker build --file docker/Dockerfile.cpu --build-arg region=${region} --tag ${image_name}:latest .
 

### Push the container to the ECR repository

In [None]:
%%script env account=$account region=$region image_name=$image_name ecr_image_url=$ecr_image_url bash

# Push the locally built image to the ECR repository addressed by $ecr_image_url.
echo "Pushing locally built container to ECR Repository: $ecr_image_url in region: $region on account: $account"

# If the repository doesn't exist in ECR, create it.
# Every ECR call is pinned to ${region}: the docker login below and $ecr_image_url
# both target that region, so a repository looked up or created under the CLI's
# default region could be a different one from the push target.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${image_name}" > /dev/null 2>&1
then
    echo "Creating a new ECR repository with name: $image_name"
    aws ecr create-repository --region "${region}" --repository-name "${image_name}" > /dev/null
fi

# Log docker in to the registry by executing the command printed by `aws ecr get-login`.
# NOTE(review): `aws ecr get-login` is not available in AWS CLI v2 -- confirm the CLI version in use.
$(aws ecr get-login --no-include-email --region "${region}")

# Tag Docker image with ECR Url
docker tag "${image_name}:latest" "${ecr_image_url}"

docker push "${ecr_image_url}"

### Prepare train and test data

In [None]:

def _get_train_test_data(data_path, sagemaker_session):
    """Upload the local ``train`` and ``test`` folders and return their inputs.

    Both folders are uploaded through ``sagemaker_session.upload_data`` under
    a common, timestamped ``tf_mnist/<timestamp>`` key prefix, so each call
    writes to a fresh location.

    Args:
        data_path: Local directory containing ``train`` and ``test``
            sub-directories.
        sagemaker_session: Object exposing ``upload_data(path=, key_prefix=)``;
            presumably a ``sagemaker.Session`` -- verify against the caller.

    Returns:
        A ``(test_input, train_input)`` tuple of the values returned by
        ``upload_data``. Note the order: the test input comes first.
    """
    s3_prefix = 'tf_mnist/{}'.format(sagemaker_timestamp())

    # Upload order is train first, then test.
    uploaded = {}
    for split in ('train', 'test'):
        uploaded[split] = sagemaker_session.upload_data(
            path=os.path.join(data_path, split),
            key_prefix=s3_prefix + '/' + split)

    return uploaded['test'], uploaded['train']

# Training sources live in ../src; the data set sits in its "data" sub-directory.
source_dir = '../src'
data_path = os.path.join(source_dir, 'data')

# The helper returns the test input first, then the train input.
(test_input, train_input) = _get_train_test_data(
    data_path=data_path, sagemaker_session=sage_session)

### Train it with Horovod

Parameters that control Horovod behaviour:
* `horovod-train-script`: Distributed training script using Horovod (passed as a hyperparameter).
* `instance_count`: Number of instances to be used for Horovod distributed training (passed to the estimator as `train_instance_count`).
* `num-processes-per-host`: Number of processes per host to be launched as part of the MPI/Horovod job (passed as a hyperparameter).

In [None]:
def train(instance_count,
          num_of_processes_per_host,
          horovod_train_script):
    """Configure a TensorFlow estimator for Horovod and call ``fit`` on it.

    The estimator uses the custom image at ``ecr_image_url``, the
    ``horovod_launcher.py`` entry point from ``source_dir`` and
    ``ml.c4.xlarge`` instances. It is fitted on the ``train`` and ``test``
    channels taken from the notebook-level ``train_input`` / ``test_input``.

    Args:
        instance_count: Number of training instances; also embedded in the
            base job name (``tf-horovod-<count>x``).
        num_of_processes_per_host: Value of the ``num-processes-per-host``
            hyperparameter.
        horovod_train_script: Value of the ``horovod-train-script``
            hyperparameter (the distributed training script).

    Returns:
        None.
    """
    horovod_hyperparameters = {
        "horovod-train-script": horovod_train_script,
        "num-processes-per-host": num_of_processes_per_host,
    }
    job_name_prefix = "tf-horovod-{}x".format(str(instance_count))

    estimator = TensorFlow(
        entry_point="horovod_launcher.py",
        source_dir=source_dir,
        image_name=ecr_image_url,
        role=role,
        sagemaker_session=sage_session,
        train_instance_count=instance_count,
        train_instance_type="ml.c4.xlarge",
        base_job_name=job_name_prefix,
        training_steps=1,
        evaluation_steps=1,
        hyperparameters=horovod_hyperparameters)

    estimator.fit({'train': train_input, 'test': test_input})
 

# Launch a Horovod job on two instances with one process per host.
train(
    instance_count=2,
    num_of_processes_per_host=1,
    horovod_train_script="train_mnist_hvd.py",
)