#!/bin/bash
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.

set -ex

NUM_HOSTS_file=$1
NUM_HOSTS=$2
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type)
GPU_COUNT=$(nvidia-smi -L | wc -l)
NODES=$(($GPU_COUNT * $NUM_HOSTS))
PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME)
TRAINING_LOG="/test/logs/testEFA.log"

# FI_EFA_USE_DEVICE_RDMA is only supported on instance types with GPUDirect RDMA support (e.g. p4d), so drop it on p3dn.
USE_DEVICE_RDMA_ARG="-x FI_EFA_USE_DEVICE_RDMA=1"
if [[ ${INSTANCE_TYPE} == p3dn.24xlarge ]]; then
    USE_DEVICE_RDMA_ARG=""
fi

check_ring_single_node(){
    echo "Running ring"
    # Not using full paths of mpirun and other executables because these paths can change across PyTorch versions.
    mpirun -n 3 --host localhost --oversubscribe \
        -x RDMAV_FORK_SAFE=1 ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple --mca pml ^cm \
        -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
        ring
    RETURN_VAL=$?

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_ring_single_node passed *****************************"
    else
        echo "***************************** check_ring_single_node failed *****************************"
        exit 1
    fi
}

check_multinode_nccl_transfer() {
    echo "Running nccl_message_transfer"
    # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into
    # the container. Not using full paths of mpirun and other executables because these paths can change across
    # PyTorch versions in DLC images.
    mpirun -n $NUM_HOSTS -N 1 --hostfile $NUM_HOSTS_file \
        -x PATH -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH -x NCCL_DEBUG=INFO \
        ${USE_DEVICE_RDMA_ARG} -x NCCL_ALGO=ring -x NCCL_PROTO=simple -x RDMAV_FORK_SAFE=1 --mca pml ^cm \
        -x FI_PROVIDER="efa" --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
        nccl_message_transfer
    RETURN_VAL=$?

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_multinode_nccl_transfer passed *****************************"
    else
        echo "***************************** check_multinode_nccl_transfer failed *****************************"
        exit 1
    fi
}

validate_all_reduce_performance_logs(){
    # These log lines confirm that NCCL actually used the aws-ofi-nccl plugin and the EFA libfabric provider.
    grep "NET/OFI Using aws-ofi-nccl" ${TRAINING_LOG}
    grep "NET/OFI Selected Provider is efa" ${TRAINING_LOG}
    grep "Using network AWS Libfabric" ${TRAINING_LOG}
    if [[ ${INSTANCE_TYPE} == p4d* ]]; then
        grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
        grep "NET/AWS Libfabric/0/GDRDMA" ${TRAINING_LOG}
    fi
}

check_efa_nccl_all_reduce_performance(){
    # Extract the bandwidth reported for the 1 GiB (1073741824-byte) message size from the all_reduce_perf output.
    benchmark=$(cat $TRAINING_LOG | grep '1073741824' | tail -n1 | awk -F " " '{print $11}' | sed 's/ //' | sed 's/ 5e-07//')
    echo "Benchmark throughput: ${benchmark}"
    # The expected throughput is at least 41 GB/s for 2 p4d nodes with 4 EFA devices and 7 GB/s for 2 p3dn nodes with
    # 1 EFA device. However, if the 2 instances are not in the same Availability Zone in the same region, performance
    # can decrease. To account for this we need to modify thresholds dynamically based on where the instances are.
    # Temporarily setting these to be < 50% of optimal until the AWS OFI NCCL team has concrete numbers for this.
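    # Example (for debugging an empty ${benchmark}): inspect the raw 1 GiB result row directly, e.g.
    #     grep '1073741824' ${TRAINING_LOG} | tail -n1
    # Assumption: the awk field index above presumes column 11 of that row holds the bandwidth in GB/s; the exact
    # column layout of all_reduce_perf output can differ between nccl-tests versions, so adjust it if needed.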
    PERFORMANCE_THRESHOLD="3"
    if [[ $(echo "$benchmark $PERFORMANCE_THRESHOLD" | awk '{print ($1 >= $2)}') == 1 ]]; then
        echo "***************************** check_efa_nccl_all_reduce_performance passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce_performance failed *****************************"
        exit 1
    fi
}

check_efa_nccl_all_reduce(){
    echo "Running all_reduce_perf test"
    # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into
    # the container. Not using full paths of mpirun and other executables because these paths can change across
    # PyTorch versions in DLC images.
    mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_file \
        -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
        -x PATH -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
        -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
        /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}"
    # tee masks mpirun's exit status, so read it from PIPESTATUS instead of $?.
    RETURN_VAL=${PIPESTATUS[0]}

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_efa_nccl_all_reduce passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce failed *****************************"
    fi

    validate_all_reduce_performance_logs
    check_efa_nccl_all_reduce_performance
}

check_ring_single_node
check_multinode_nccl_transfer
check_efa_nccl_all_reduce
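
# Example invocation (hypothetical script name and hostfile path, assuming an Open MPI hostfile that describes all
# participating nodes and a matching host count):
#     ./testEFA.sh /test/hostfile 2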