"""Start an EC2 instance, launch the EFA training container and run training.

Workflow (all steps run at import/execution time, top to bottom):

1. Start the EC2 instance given by ``--instance_id`` (region ``us-east-1``).
2. Poll until the instance reports a public IP and answers a trivial SSH
   command (``lspci``).
3. Start the ``mpicont`` Docker container on every host.
4. Launch the ``mpirun`` training command inside the container on the master
   host, in the background, logging to ``~/shared_workspace/logs/out.log``.
5. Sleep for a fixed period, then run ``parse_and_submit.py`` on the log.

The instance is left running when the script finishes; see the cleanup
section at the bottom for how to stop it.

Usage:
    python <this script> --instance_id i-0123... --docker_user USER --keypair KEY.pem
"""
################################################################
# Launch DLAMI with EFA
################################################################
import argparse
import os
import time
from datetime import datetime
from time import sleep

import boto3
import yaml  # not used in the visible script; kept in case other code relies on it

# Project-local SSH helper. Imported up front so that a missing module fails
# before a (billable) instance is started.
import ssh

# Seconds between readiness checks while the instance is booting.
READY_POLL_SECONDS = 10
# Seconds to wait after launching training before parsing the log (50 minutes).
# NOTE(review): presumably sized to one training epoch on this setup - confirm.
TRAINING_WAIT_SECONDS = 3000

parser = argparse.ArgumentParser()
# All three arguments are mandatory: without them the script would previously
# fail later with an unrelated TypeError or try to pull the image "None/efa".
parser.add_argument("--instance_id", required=True,
                    help="ID of the EC2 instance to start")
parser.add_argument("--docker_user", required=True,
                    help="Docker Hub user/namespace that owns the image")
parser.add_argument("--keypair", required=True,
                    help="SSH key file; relative paths are resolved against "
                         "the current working directory")
args = parser.parse_args()

# os.path.join keeps the old behaviour for relative paths and additionally
# accepts an absolute path unchanged.
keypair = os.path.join(os.getcwd(), args.keypair)
dockerhub_user = args.docker_user
dockerhub_repo = 'efa'
dockerhub_tag = 'dlami_28'

ec2_session = boto3.Session(region_name="us-east-1")
ec2_client = ec2_session.client("ec2")
ec2_resource = ec2_session.resource("ec2")

# To create a fresh instance instead of starting an existing one:
#response = ec2_client.run_instances(**config)
response = ec2_client.start_instances(InstanceIds=[args.instance_id])
print(response)

################################################################
# Create SSH interface to all instances
# Runs in loop while waiting for instances to be ready
################################################################

# The response does not change between retries, so extract the ids once.
# A malformed response now fails immediately instead of being retried forever.
instances = [instance['InstanceId'] for instance in response['StartingInstances']]

while True:
    try:
        status = ec2_resource.meta.client.describe_instances(InstanceIds=instances)
        # 'PublicIpAddress' is looked up directly, so a missing key raises
        # KeyError and is retried by the handler below.
        public_ips = [instance['PublicIpAddress']
                      for instance in status['Reservations'][0]['Instances']]
        ssh_client = ssh.SSH(public_ips, keypair)
        # run a simple command to make sure instances are up
        pci = ssh_client.run_on_all('lspci')
        break
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so Ctrl-C and
        # SystemExit still stop the script, and report why we are retrying so
        # a permanent problem (bad key, wrong id) is visible.
        print("Instance not ready yet ({!r}); retrying in {}s".format(
            err, READY_POLL_SECONDS))
        sleep(READY_POLL_SECONDS)

print(pci[0]['stdout'])

################################################################
# Setup Containers
################################################################

# Each trailing backslash is a Python line continuation inside the string
# literal, so the command is sent as a single line.
launch_cont = """docker run --rm -it -d --gpus all \
    --name mpicont \
    --net=host --uts=host --ipc=host \
    --ulimit stack=67108864 --ulimit memlock=-1 \
    --security-opt seccomp=unconfined \
    -v /opt/amazon/efa:/efa \
    -v /home/ubuntu/ssh_container:/root/.ssh \
    -v ~/shared_workspace:/workspace/shared_workspace \
    --device=/dev/infiniband/uverbs0 \
    {0}/{1}:{2}
""".format(dockerhub_user, dockerhub_repo, dockerhub_tag)

ssh_client.run_on_all(launch_cont)
ssh_client.run_on_all("docker images > ~/imagelog")

################################################################
# Launch Training
# Run training in background so it will continue
# if disconnected from instance.
# To run in the foreground, remove the trailing
# `&> ~/shared_workspace/logs/out.log&` from the docker exec
# command below (the command does not use `nohup`).
################################################################

# This string is substituted inside double quotes in the `docker exec ...
# bash -c "..."` command below, hence the escaped quotes around efa.
# NOTE(review): because of those outer double quotes, $LD_LIBRARY_PATH is
# presumably expanded by the host shell rather than inside the container -
# confirm this is intended.
training_launch = """
mpirun --allow-run-as-root \
    -x FI_PROVIDER=\\\"efa\\\" \
    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/efa/lib:/usr/local/lib:/nccl/build/lib:/aws-ofi-nccl/install/lib \
    -x NCCL_DEBUG=INFO \
    -x NCCL_TREE_THRESHOLD=0 \
    --hostfile /root/.ssh/hosts \
    --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 \
    --mca btl_vader_single_copy_mechanism none \
    --mca oob_tcp_if_include ens5 \
    --mca btl_tcp_if_include ens5 \
    python /workspace/shared_workspace/deep-learning-models/models/vision/detection/tools/train_docker.py \
    --configuration /workspace/shared_workspace/deep-learning-models/models/vision/detection/configs/docker_default_config.py \
    --base_learning_rate 15e-3 \
    --batch_size_per_device 4 \
    --fp16 True \
    --schedule 1x \
    --warmup_init_lr_scale 3.0 \
    --warmup_steps 1000 \
    --use_rcnn_bn False \
    --use_conv True \
    --ls 0.0 \
    --epochs 1 \
    --name demo
"""

ssh_client.run_on_master('mkdir -p ~/shared_workspace/logs')
ssh_client.run_on_all('date > time1')

# The two timestamps bracket only the launch call; the training command itself
# is put in the background by the trailing `&`.
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
start = time.time()
training_thread = ssh_client.run_on_master("""docker exec mpicont bash -c \"{}\" &> ~/shared_workspace/logs/out.log&""".format(training_launch))
print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
end = time.time()

################################################################
# Cleanup and shutdown
# The steps below are currently disabled, so the container and
# the instance are left running when this script ends:
#   - disconnect from notebook
#   - stop docker container
#   - stop instance
# To stop the instance so it can be used again later use
# ec2_client.stop_instances(InstanceIds=instances)
# ec2_client.start_instances(InstanceIds=instances)
################################################################
#notebook.disconnect()
#ssh_client.run_on_all("docker stop mpicont")

# Fixed wait before the training log is parsed.
sleep(TRAINING_WAIT_SECONDS)
ssh_client.run_on_all("python ~/shared_workspace/logs/parse_and_submit.py ~/shared_workspace/logs/out.log 8 32 p3dn.24xlarge EC2 > parselog")
#ec2_client.stop_instances(InstanceIds=instances)