################################################################
# Launch DLAMI with EFA
################################################################
import argparse
import os

import boto3

parser = argparse.ArgumentParser()
parser.add_argument("--instance_id1")
parser.add_argument("--instance_id2")
parser.add_argument("--docker_user")
parser.add_argument("--keypair")
args = parser.parse_args()

keypair = os.path.join(os.getcwd(), args.keypair)

ec2_session = boto3.Session(region_name="us-east-1")
ec2_client = ec2_session.client("ec2")

# to launch fresh instances from a config dict instead, use:
# response = ec2_client.run_instances(**config)
response = ec2_client.start_instances(InstanceIds=[args.instance_id1, args.instance_id2])
print(response)

################################################################
# Create SSH interface to all instances
# Runs in a loop while waiting for the instances to be ready
################################################################
import ssh
from time import sleep

while True:
    try:
        instances = [instance['InstanceId'] for instance in response['StartingInstances']]
        status = ec2_client.describe_instances(InstanceIds=instances)
        public_ips = [instance['PublicIpAddress'] for instance in status['Reservations'][0]['Instances']]
        ssh_client = ssh.SSH(public_ips, keypair)
        # run a simple command to make sure the instances are reachable
        pci = ssh_client.run_on_all('lspci')
        break
    except Exception:
        sleep(10)
        continue

print(pci[0]['stdout'])
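################################################################
# Optional: use a boto3 waiter instead of polling (sketch)
# A minimal alternative to the try/except loop above: boto3 ships an
# 'instance_status_ok' waiter that blocks until the instances pass
# their status checks. The SSH setup would still follow as above.
################################################################
'''
waiter = ec2_client.get_waiter('instance_status_ok')
waiter.wait(InstanceIds=[args.instance_id1, args.instance_id2])
'''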
################################################################
# Use local AWS credentials for EC2
################################################################
'''
import getpass
import configparser

credentials = configparser.ConfigParser()
credentials.read('/Users/{0}/.aws/credentials'.format(getpass.getuser()))
config = configparser.ConfigParser()
config.read('/Users/{0}/.aws/config'.format(getpass.getuser()))

ssh_client.run_on_all('aws configure set aws_access_key_id {}'.format(credentials['default']['aws_access_key_id']))
ssh_client.run_on_all('aws configure set aws_secret_access_key {}'.format(credentials['default']['aws_secret_access_key']))
ssh_client.run_on_all('aws configure set default.region {}'.format(config['default']['region']))

del credentials
del config
'''

################################################################
# Update EFA driver to 1.8.4 -- takes about 2 minutes
# Runs in a loop in case of a periodic installation error [track this down]
################################################################
# ssh_client.scp_local_to_all('efa_tutorial/setup_scripts/efa_setup.sh', 'efa_setup.sh')
'''
ssh_client.run_on_all('./efa_setup.sh')

################################################################
# Check to make sure the driver is updated
# mpirun should report version 4.0.3
################################################################
version_check = ssh_client.run_on_all('/opt/amazon/openmpi/bin/mpirun --version')
while not all(['4.0.3' in i['stdout'] for i in version_check]):
    sleep(10)
    ssh_client.run_on_all('./efa_setup.sh')
    version_check = ssh_client.run_on_all('/opt/amazon/openmpi/bin/mpirun --version')
print(version_check[0]['stdout'])
'''

################################################################
# Mount the local NVMe drive
################################################################
'''
ssh_client.run_on_all('mkdir -p ~/shared_workspace')
ssh_client.run_on_all('sudo mkfs -t xfs /dev/nvme0n1')
ssh_client.run_on_all('sudo mount /dev/nvme0n1 ~/shared_workspace')
ssh_client.run_on_all('mkdir -p ~/shared_workspace/data')
ssh_client.run_on_all('sudo chmod -R 777 ~/shared_workspace')
'''

################################################################
# Download COCO data
# Specific to vision models. wait=True returns background handles
# that are checked with .done() before unarchiving further below.
################################################################
'''
print('start downloading')
download_coco = "aws s3 cp --recursive s3://jbsnyder-sagemaker/faster-rcnn/data/ ~/shared_workspace/data > ~/s3log"
coco_thread = ssh_client.run_on_all(download_coco, wait=True)
print('download started')
'''

################################################################
# Build Docker image -- takes about 10 minutes
# Only run the first time
################################################################
first_run = False
dockerhub_user = args.docker_user
dockerhub_repo = 'efa'
dockerhub_tag = 'dlami_28'

if first_run:
    ssh_client.scp_local_to_master('../docker', 'docker', recursive=True)
    ssh_client.run_on_master('cp -R /opt/amazon/efa docker/')
    ssh_client.run_on_master('cd docker && docker build -t {}/{}:{} .'.format(dockerhub_user, dockerhub_repo, dockerhub_tag))

################################################################
# Deploy Docker image to all nodes
# Only run the first time
################################################################
if first_run:
    # Warning: a bug in ipykernel can sometimes cause the password to echo.
    # Recommend running this in standard Python.
    import getpass
    dh_password = getpass.getpass('enter dockerhub password')
    ssh_client.run_on_master('docker login --username {} --password {}'.format(dockerhub_user, dh_password))
    del dh_password
    ssh_client.run_on_master('docker push {}/{}:{}'.format(dockerhub_user, dockerhub_repo, dockerhub_tag))
    ssh_client.run_on_workers('docker pull {}/{}:{}'.format(dockerhub_user, dockerhub_repo, dockerhub_tag))

################################################################
# After the first run, just pull the image to all nodes
################################################################
'''
ssh_client.run_on_all('docker pull {}/{}:{} > ~/dockerlog'.format(dockerhub_user, dockerhub_repo, dockerhub_tag))
'''

################################################################
# Set up internode communication
# Passes the same SSH credentials to all nodes and makes sure
# they can communicate without a login prompt
################################################################
'''
private_ips = [instance['PrivateIpAddress'] for instance in status['Reservations'][0]['Instances']]
print(private_ips)
ssh.create_hostfile(ssh_client, private_ips)
print('hostfile created')
ssh.create_ssh_comm(ssh_client)
print('comm created')
ssh.setup_container_communication(ssh_client)
print('container set')
'''

################################################################
# Set up containers
################################################################
launch_cont = """docker run --rm -it -d --gpus all \
    --name mpicont \
    --net=host --uts=host --ipc=host \
    --ulimit stack=67108864 --ulimit memlock=-1 \
    --security-opt seccomp=unconfined \
    -v /opt/amazon/efa:/efa \
    -v /home/ubuntu/ssh_container:/root/.ssh \
    -v ~/shared_workspace:/workspace/shared_workspace \
    --device=/dev/infiniband/uverbs0 \
    {0}/{1}:{2}
""".format(dockerhub_user, dockerhub_repo, dockerhub_tag)

ssh_client.run_on_all(launch_cont)
ssh_client.run_on_all("docker images > ~/imagelog")
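################################################################
# Optional: verify the EFA device is visible inside the container
# A sketch using libfabric's fi_info utility, which ships with the
# EFA install under /opt/amazon/efa (mounted at /efa above). The
# /efa/bin/fi_info path is an assumption about the DLAMI layout.
################################################################
'''
efa_check = ssh_client.run_on_all('docker exec mpicont /efa/bin/fi_info -p efa')
print(efa_check[0]['stdout'])
'''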
################################################################
# Set up nccl-tests (optional)
################################################################
'''
ssh_client.run_on_all('docker exec mpicont /bin/bash -c "cd /workspace/shared_workspace && git clone https://github.com/NVIDIA/nccl-tests.git" > ~/execlog1')
ssh_client.run_on_all('docker exec mpicont /bin/bash -c "cd /workspace/shared_workspace/nccl-tests && make MPI=1 MPI_HOME=/usr/local/ NCCL_HOME=/nccl/build" > ~/execlog2')
'''

################################################################
# Run NCCL tests
################################################################
'''
import re

nccl_efa_command = """mpirun -x FI_PROVIDER="efa" \
    --allow-run-as-root \
    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/efa/lib:/usr/local/lib:/nccl/build/lib:/aws-ofi-nccl/install/lib \
    -x NCCL_DEBUG=INFO \
    -x NCCL_TREE_THRESHOLD=0 \
    -x NCCL_SOCKET_IFNAME=ens5 \
    --hostfile /root/.ssh/hosts \
    --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 \
    --mca btl_vader_single_copy_mechanism none \
    --mca oob_tcp_if_include ens5 \
    --mca btl_tcp_if_include ens5 \
    --oversubscribe \
    /workspace/shared_workspace/nccl-tests/build/all_reduce_perf \
    -b 8 -e 4G -f 2 -g 1 -c 0
"""

# capture stdout here rather than redirecting to a log file
# so the bandwidth can be parsed from the final summary line
efa_result = ssh_client.run_on_master('docker exec mpicont bash -c \"{}\"'.format(nccl_efa_command))
efa_bandwidth = float(re.findall("\d+\.\d+", efa_result['stdout'].split(':')[-1])[0])
print("EFA bandwidth: {}".format(efa_bandwidth))
'''
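################################################################
# Optional: confirm NCCL actually selected the EFA provider
# With NCCL_DEBUG=INFO, the aws-ofi-nccl plugin logs which libfabric
# provider it selected. A rough sanity check, assuming the NCCL test
# block above was run; the exact log wording varies by plugin version.
################################################################
'''
if 'efa' not in efa_result['stdout'].lower():
    print('Warning: EFA provider not reported in NCCL debug output')
'''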
################################################################
# Make sure the COCO download is complete before unarchiving
# Specific to vision models
################################################################
'''
while not all([i.done() for i in coco_thread]):
    sleep(1)
    continue

ssh_client.run_on_all('cd ~/shared_workspace/data/coco && tar -xf coco.tar')
ssh_client.run_on_all("cd shared_workspace && git clone -b staging https://github.com/aws-samples/deep-learning-models")
'''

################################################################
# Start JupyterLab (optional)
# Useful environment for interacting with the container;
# also contains TensorBoard and monitoring tools
################################################################
'''
notebook = ssh.Notebook(ssh_client)
print(notebook.get_token())
'''

################################################################
# Launch training
# Training runs in a background thread so it will continue if you
# disconnect from the instance; output is redirected to
# ~/shared_workspace/logs/out.log. To run in the foreground and see
# output directly, drop the `&> ~/shared_workspace/logs/out.log`
# redirect.
################################################################
training_launch = """mpirun --allow-run-as-root \
    -x FI_PROVIDER=\\\"efa\\\" \
    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/efa/lib:/usr/local/lib:/nccl/build/lib:/aws-ofi-nccl/install/lib \
    -x NCCL_DEBUG=INFO \
    -x NCCL_TREE_THRESHOLD=0 \
    --hostfile /root/.ssh/hosts \
    --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 \
    --mca btl_vader_single_copy_mechanism none \
    --mca oob_tcp_if_include ens5 \
    --mca btl_tcp_if_include ens5 \
    python /workspace/shared_workspace/deep-learning-models/models/vision/detection/tools/train_docker.py \
    --configuration /workspace/shared_workspace/deep-learning-models/models/vision/detection/configs/docker_default_config.py \
    --base_learning_rate 15e-3 \
    --batch_size_per_device 4 \
    --fp16 True \
    --schedule 1x \
    --warmup_init_lr_scale 3.0 \
    --warmup_steps 1000 \
    --use_rcnn_bn False \
    --use_conv True \
    --ls 0.0 \
    --epochs 1 \
    --name demo
"""

ssh_client.run_on_master('mkdir -p ~/shared_workspace/logs')
training_thread = ssh_client.run_on_master("""docker exec mpicont bash -c \"{}\" &> ~/shared_workspace/logs/out.log""".format(training_launch))

################################################################
# Cleanup and shutdown
# - disconnect from the notebook
# - stop the Docker container
# - stop the instances
# Instances are stopped rather than terminated so they can be
# restarted later with ec2_client.start_instances(InstanceIds=instances);
# if you are done with them, use
# ec2_client.terminate_instances(InstanceIds=instances) instead.
################################################################
# notebook.disconnect()
# ssh_client.run_on_all("docker stop mpicont")

# crude wait for the single-epoch run to finish before parsing logs
sleep(3000)
ssh_client.run_on_master("python ~/shared_workspace/logs/parse_and_submit.py ~/shared_workspace/logs/out.log 16 64 p3dn.24xlarge EC2 > parselog")
ec2_client.stop_instances(InstanceIds=instances)
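################################################################
# Optional: poll for training completion instead of a fixed sleep
# A sketch assuming run_on_master returns a future-like handle with
# .done(), matching how run_on_all results are used above. To use it,
# replace the sleep(3000) above with this loop.
################################################################
'''
while not training_thread.done():
    sleep(60)
'''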