#!/bin/bash
set -e

source /test/bin/pytorch_tests/setupPyTorchBackendTest

export USE_INDUCTOR=$1
echo "testPyTorchNCCL: USE_INDUCTOR=$USE_INDUCTOR"

# May hit CUDA OOM if too many processes are used; the benchmark repo hardcodes the batch size for now.
# Cap the world size at 4 GPUs.
export WORLD_SIZE=$(( NUM_GPUS < 4 ? NUM_GPUS : 4 ))

# Run NCCL benchmarking with $WORLD_SIZE GPUs on 1 local node (resnet50 comes from torchvision).
python userbenchmark/ddp_experiments/__init__.py \
    --ngpus $WORLD_SIZE \
    --distributed ddp \
    --nodes 1 \
    --cluster local \
    --filter_models resnet50 \
    --timeout 10 \
    --job_dir $AWS_LOG_DIR \
    --nccl-socket-ifname eth0

# Generate the result csv for the AWS-PyTorch run.
JOB_ID=$(ls $AWS_LOG_DIR | grep '\.out' | head -n 1 | cut -d'_' -f 1)
python userbenchmark/ddp_experiments/parse_ddp.py \
    --job_id $JOB_ID \
    --results_dir $AWS_LOG_DIR \
    --csv_out > $AWS_LOG_DIR/aws_res.csv
echo "testPyTorchNCCL: AWS-PyTorch test results" && cat $AWS_LOG_DIR/aws_res.csv

# Get OSS perf data.
bash ${BIN_DIR}/pytorch_tests/installOSSPyTorch

# Run the same NCCL benchmarking with $WORLD_SIZE GPUs on 1 local node (resnet50 comes from torchvision).
python userbenchmark/ddp_experiments/__init__.py \
    --ngpus $WORLD_SIZE \
    --distributed ddp \
    --nodes 1 \
    --cluster local \
    --filter_models resnet50 \
    --timeout 10 \
    --job_dir $OSS_LOG_DIR \
    --nccl-socket-ifname eth0

# Generate the result csv for the OSS-PyTorch run.
JOB_ID=$(ls $OSS_LOG_DIR | grep '\.out' | head -n 1 | cut -d'_' -f 1)
python userbenchmark/ddp_experiments/parse_ddp.py \
    --job_id $JOB_ID \
    --results_dir $OSS_LOG_DIR \
    --csv_out > $OSS_LOG_DIR/oss_res.csv
echo "testPyTorchNCCL: OSS-PyTorch test results" && cat $OSS_LOG_DIR/oss_res.csv

# Compare AWS-PyTorch performance against OSS-PyTorch performance.
bash ${BIN_DIR}/pytorch_tests/evaluateResults $AWS_LOG_DIR/aws_res.csv $OSS_LOG_DIR/oss_res.csv

exit 0