#!/bin/bash
set -e

source /test/bin/pytorch_tests/setupPyTorchBackendTest

export USE_INDUCTOR=$1
echo "testPyTorchNCCL: USE_INDUCTOR=$USE_INDUCTOR"

# May hit CUDA OOM if too many processes are used; the benchmark repo hardcodes the batch size for now.
# Cap the world size at 4 GPUs.
export WORLD_SIZE=$(( NUM_GPUS < 4 ? NUM_GPUS : 4 ))

# Run NCCL benchmarking with $WORLD_SIZE GPUs on 1 local node (resnet50 comes from torchvision).
python userbenchmark/ddp_experiments/__init__.py \
    --ngpus $WORLD_SIZE \
    --distributed ddp \
    --nodes 1 \
    --cluster local \
    --filter_models resnet50 \
    --timeout 10 \
    --job_dir $AWS_LOG_DIR \
    --nccl-socket-ifname eth0

# Generate the result csv for the AWS-PyTorch run.
JOB_ID=$(ls $AWS_LOG_DIR | grep '\.out' | head -n 1 | cut -d'_' -f 1)
python userbenchmark/ddp_experiments/parse_ddp.py \
    --job_id $JOB_ID \
    --results_dir $AWS_LOG_DIR \
    --csv_out > $AWS_LOG_DIR/aws_res.csv
echo "testPyTorchNCCL: AWS-PyTorch test results" && cat $AWS_LOG_DIR/aws_res.csv

# Get OSS perf data.
bash ${BIN_DIR}/pytorch_tests/installOSSPyTorch

# Run the same NCCL benchmarking with $WORLD_SIZE GPUs on 1 local node (resnet50 comes from torchvision).
python userbenchmark/ddp_experiments/__init__.py \
    --ngpus $WORLD_SIZE \
    --distributed ddp \
    --nodes 1 \
    --cluster local \
    --filter_models resnet50 \
    --timeout 10 \
    --job_dir $OSS_LOG_DIR \
    --nccl-socket-ifname eth0

# Generate the result csv for the OSS-PyTorch run.
JOB_ID=$(ls $OSS_LOG_DIR | grep '\.out' | head -n 1 | cut -d'_' -f 1)
python userbenchmark/ddp_experiments/parse_ddp.py \
    --job_id $JOB_ID \
    --results_dir $OSS_LOG_DIR \
    --csv_out > $OSS_LOG_DIR/oss_res.csv
echo "testPyTorchNCCL: OSS-PyTorch test results" && cat $OSS_LOG_DIR/oss_res.csv

# Compare AWS-PyTorch performance against OSS-PyTorch performance.
bash ${BIN_DIR}/pytorch_tests/evaluateResults $AWS_LOG_DIR/aws_res.csv $OSS_LOG_DIR/oss_res.csv

exit 0