#!/bin/bash
# Benchmark NCCL collective performance with nccl-tests, then sweep the main
# collectives with PARAM's comms benchmark.

export NUM_GPUS=8
export NCCL_DEBUG=info                 # print NCCL init/topology info to stdout
export MPI_HOME=/opt/amazon/openmpi    # MPI install root (not its bin/ subdirectory)

# Single-node all_reduce sweep: message sizes from 8 bytes to 4 GB, doubling
# each step (-f 2), 100 iterations per size, one GPU per rank (-g 1).
/opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np $NUM_GPUS \
    /workspace/nccl-tests/build/all_reduce_perf \
    -g 1 -b 8 -e 4GB -f 2 -n 100

# Repeat the sweep for each collective with PARAM's comms benchmark:
# 8 bytes to 1 GB, doubling each step, 100 iterations, blocking mode (--z 1).
for collective in all_reduce all_gather broadcast reduce_scatter all_to_all ; do
    echo "TESTING COLLECTIVE: $collective"
    /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np $NUM_GPUS \
        python param/train/comms/pt/comms.py \
        --b 8 --e 1G --n 100 --f 2 --z 1 --collective $collective \
        --backend nccl --device cuda --log INFO
done
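
# --- Optional: a hedged sketch of the same all_reduce sweep across nodes. ---
# Assumptions not taken from the script above: a hostfile "hosts.txt" listing
# one entry per node, 8 GPUs per node, and a NUM_NODES variable you set; the
# hostfile name and NUM_NODES are hypothetical. --hostfile, -N (ranks per
# node), and -x (forward an environment variable to every rank) are standard
# Open MPI flags. Left commented out so the single-node run above is unchanged.
#
# NUM_NODES=2
# /opt/amazon/openmpi/bin/mpirun --allow-run-as-root \
#     --hostfile hosts.txt -N 8 -np $((NUM_NODES * 8)) \
#     -x NCCL_DEBUG -x LD_LIBRARY_PATH \
#     /workspace/nccl-tests/build/all_reduce_perf \
#     -g 1 -b 8 -e 4GB -f 2 -n 100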