#!/bin/bash
#
# Sanity test for Horovod with MXNet, launched through mpirun.
#
# Usage: <script> <INSTANCE_TYPE>
#
# Steps:
#   1. Skip immediately (exit 0) on g2* instance types.
#   2. Run testMXNetHVD.py with a single MPI process and require a
#      "Time cost=" line in the captured stderr log.
#   3. Run mxnet_train_mnist_hvd.py (one process per GPU, capped at 8, or a
#      single process with --no-cuda when nvidia-smi fails) and require
#      "Time cost=" or "Achieved accuracy" in the captured stderr log.
#
# Exit status: 0 on success or skip, 1 on bad usage or a failed check.

INSTANCE_TYPE=$1
if [[ -z "$INSTANCE_TYPE" ]]; then
  echo "Usage: $0 <INSTANCE_TYPE>"
  exit 1
fi

if [[ "$INSTANCE_TYPE" == g2* ]]; then
  echo "This is g2* instance. Skipped."
  exit 0
fi

HOME_DIR=/test
BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs
TRAINING_LOG=${LOG_DIR}/mxnet_horovod_test.log
HOROVOD_DIR=${BIN_DIR}/examples/Horovod
MPIRUN=/usr/local/bin/mpirun

# We only have the Ubuntu 16.04 image now; the only network interface it
# supports is eth0. Defined up front because both the GPU and the CPU
# training commands pass it as NCCL_SOCKET_IFNAME.
INTERFACE=eth0

# mpirun options shared by every invocation in this script.
MPI_COMMON_ARGS=(-x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl '^openib')

set -e

# The log file is written via tee and redirection below; make sure its
# directory exists so those writes do not abort the script.
mkdir -p "$LOG_DIR"

echo "Simply verify if Horovod works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a "$TRAINING_LOG"

# Probe for a usable GPU. A non-zero status is expected on CPU-only hosts,
# so capture it instead of letting `set -e` terminate the script.
RETURN_VAL=0
nvidia-smi || RETURN_VAL=$?

# ---------------------------------------------------------------------------
# Stage 1: single-process Horovod smoke test.
# ---------------------------------------------------------------------------
SMOKE_ARGS=()
if (( RETURN_VAL != 0 )); then
  SMOKE_ARGS+=(--no-cuda)
fi

# NOTE: `2>` truncates the log on purpose. Each stage greps the log for its
# own success marker, so output from an earlier run must not linger.
# The exit status is captured so that a failing mpirun reaches the
# diagnostic branch below instead of aborting silently under `set -e`.
MPI_STATUS=0
"$MPIRUN" -np 1 -bind-to none -map-by slot \
  "${MPI_COMMON_ARGS[@]}" \
  python -u "${BIN_DIR}/testMXNetHVD.py" "${SMOKE_ARGS[@]}" \
  2> "$TRAINING_LOG" || MPI_STATUS=$?

if (( MPI_STATUS == 0 )) && grep "Time cost=" "$TRAINING_LOG"; then
  echo "Successfully verified Horovod works with mpi."
else
  echo "Horovod training failed with mpi."
  cat "$TRAINING_LOG"
  exit 1
fi

# ---------------------------------------------------------------------------
# Stage 2: train a CNN on MNIST with Horovod.
# ---------------------------------------------------------------------------
echo "Train a CNN model using Horovod with MXNet on MNIST data. You can follow progress on the log file : $TRAINING_LOG" | tee -a "$TRAINING_LOG"

if (( RETURN_VAL == 0 )); then
  # One MPI process per GPU reported by nvidia-smi, capped at 8.
  GPU_AMOUNT=$(nvidia-smi -L | wc -l)
  if (( GPU_AMOUNT >= 8 )); then
    GPU_AMOUNT=8
  fi
  LAUNCH_ARGS=(-np "$GPU_AMOUNT" -hostfile "${HOROVOD_DIR}/hosts")
  TRAIN_ARGS=()
else
  LAUNCH_ARGS=(-np 1)
  TRAIN_ARGS=(--no-cuda)
fi

# Train. As in stage 1, the log is truncated and the status is captured.
MPI_STATUS=0
"$MPIRUN" "${LAUNCH_ARGS[@]}" -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  "${MPI_COMMON_ARGS[@]}" \
  -x "NCCL_SOCKET_IFNAME=${INTERFACE}" -mca btl_tcp_if_exclude lo,docker0 \
  python -W ignore "${HOROVOD_DIR}/mxnet_train_mnist_hvd.py" "${TRAIN_ARGS[@]}" \
  2> "$TRAINING_LOG" || MPI_STATUS=$?

# Either marker in the log counts as success; the second grep only runs
# when the first finds nothing.
if (( MPI_STATUS == 0 )) \
  && { grep "Time cost=" "$TRAINING_LOG" || grep "Achieved accuracy" "$TRAINING_LOG"; }; then
  echo "Horovod sanity test using CNN model on MNIST dataset is successful."
  exit 0
fi

echo "Horovod sanity test using CNN model on MNIST dataset has failed"
echo "Training output: "
cat "$TRAINING_LOG"
exit 1