#!/bin/bash
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.

set -ex

NUM_HOSTS_file=$1
NUM_HOSTS=$2
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type)
GPU_COUNT=$(nvidia-smi -L | wc -l)
NODES=$(($GPU_COUNT * $NUM_HOSTS))
PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME)
TRAINING_LOG="/test/logs/testEFA.log"

# FI_EFA_USE_DEVICE_RDMA is only supported on instance types with GPUDirect RDMA support (e.g. p4d), so drop it on p3dn.
USE_DEVICE_RDMA_ARG="-x FI_EFA_USE_DEVICE_RDMA=1"
if [[ ${INSTANCE_TYPE} == p3dn.24xlarge ]]; then
    USE_DEVICE_RDMA_ARG=""
fi

check_ring_single_node(){
    echo "Running ring"
    # Not using full paths of mpirun and other executables because these paths can change across PyTorch versions.
    mpirun -n 3 --host localhost --oversubscribe \
        -x RDMAV_FORK_SAFE=1 ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple --mca pml ^cm \
        -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
        ring
    RETURN_VAL=$?

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_ring_single_node passed *****************************"
    else
        echo "***************************** check_ring_single_node failed *****************************"
        exit 1
    fi
}

check_multinode_nccl_transfer() {
    echo "Running nccl_message_transfer"
    # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into
    # the container. Not using full paths of mpirun and other executables because these paths can change across
    # PyTorch versions in DLC images.
    mpirun -n $NUM_HOSTS -N 1 --hostfile $NUM_HOSTS_file \
        -x PATH -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH -x NCCL_DEBUG=INFO \
        ${USE_DEVICE_RDMA_ARG} -x NCCL_ALGO=ring -x NCCL_PROTO=simple -x RDMAV_FORK_SAFE=1 --mca pml ^cm \
        -x FI_PROVIDER="efa" --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
        nccl_message_transfer
    RETURN_VAL=$?

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_multinode_nccl_transfer passed *****************************"
    else
        echo "***************************** check_multinode_nccl_transfer failed *****************************"
        exit 1
    fi
}

validate_all_reduce_performance_logs(){
    # These log lines confirm that NCCL actually used the aws-ofi-nccl plugin and the EFA libfabric provider.
    grep "NET/OFI Using aws-ofi-nccl" ${TRAINING_LOG}
    grep "NET/OFI Selected Provider is efa" ${TRAINING_LOG}
    grep "Using network AWS Libfabric" ${TRAINING_LOG}
    if [[ ${INSTANCE_TYPE} == p4d* ]]; then
        grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
        grep "NET/AWS Libfabric/0/GDRDMA" ${TRAINING_LOG}
    fi
}

check_efa_nccl_all_reduce_performance(){
    # Extract the bandwidth reported for the 1 GiB (1073741824-byte) message size from the all_reduce_perf output.
    benchmark=$(cat $TRAINING_LOG | grep '1073741824' | tail -n1 | awk -F " " '{print $11}' | sed 's/ //' | sed 's/ 5e-07//')
    echo "Benchmark throughput: ${benchmark}"
    # The expected throughput is at least 41 GB/s for 2 p4d nodes with 4 EFA devices and 7 GB/s for 2 p3dn nodes with
    # 1 EFA device. However, if the 2 instances are not in the same Availability Zone in the same region, performance
    # can decrease. To account for this we need to modify thresholds dynamically based on where the instances are.
    # Temporarily setting these to be < 50% of optimal until the AWS OFI NCCL team has concrete numbers for this.
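    # Example (for debugging an empty ${benchmark}): inspect the raw 1 GiB result row directly, e.g.
    #     grep '1073741824' ${TRAINING_LOG} | tail -n1
    # Assumption: the awk field index above presumes column 11 of that row holds the bandwidth in GB/s; the exact
    # column layout of all_reduce_perf output can differ between nccl-tests versions, so adjust it if needed.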
    PERFORMANCE_THRESHOLD="3"
    if [[ $(echo "$benchmark $PERFORMANCE_THRESHOLD" | awk '{print ($1 >= $2)}') == 1 ]]; then
        echo "***************************** check_efa_nccl_all_reduce_performance passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce_performance failed *****************************"
        exit 1
    fi
}

check_efa_nccl_all_reduce(){
    echo "Running all_reduce_perf test"
    # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into
    # the container. Not using full paths of mpirun and other executables because these paths can change across
    # PyTorch versions in DLC images.
    mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_file \
        -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
        -x PATH -x LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
        -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
        /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}"
    # tee masks mpirun's exit status, so read it from PIPESTATUS instead of $?.
    RETURN_VAL=${PIPESTATUS[0]}

    if [ ${RETURN_VAL} -eq 0 ]; then
        echo "***************************** check_efa_nccl_all_reduce passed *****************************"
    else
        echo "***************************** check_efa_nccl_all_reduce failed *****************************"
    fi

    validate_all_reduce_performance_logs
    check_efa_nccl_all_reduce_performance
}

check_ring_single_node
check_multinode_nccl_transfer
check_efa_nccl_all_reduce
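
# Example invocation (hypothetical script name and hostfile path, assuming an Open MPI hostfile that describes all
# participating nodes and a matching host count):
#     ./testEFA.sh /test/hostfile 2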