#!/bin/bash
#
# Sanity test for Horovod with TensorFlow 2.
#
# Usage: <this script> <instance_type>
#
# Steps:
#   1. Run testTF2HVD.py (plain on GPU, with --no-cuda otherwise) and require
#      a "Loss =" line in its log.
#   2. Run tensorflow2_synthetic_benchmark.py (through horovodrun on GPU,
#      directly with --no-cuda otherwise) and require a "Total img/sec on" or
#      "Finished in" line in its log.
#
# A machine counts as "GPU" when `nvidia-smi` exits with status 0.
#
# Exit status: 0 when both steps pass, 1 otherwise.

INSTANCE_TYPE=$1
if [ -z "$INSTANCE_TYPE" ]; then
  echo "Usage: $0 <instance_type>" >&2
  exit 1
fi

HOME_DIR=/test
BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs
MPI_TRAINING_LOG=${LOG_DIR}/mpi_tensorflow_horovod_test.log
TRAINING_LOG=${LOG_DIR}/tensorflow_horovod_test.log
HOROVOD_DIR=${BIN_DIR}/examples/Horovod
BENCHMARK_SCRIPT=${HOROVOD_DIR}/tensorflow2_synthetic_benchmark.py
# Upper bound on the number of processes handed to horovodrun.
MAX_GPUS=8

set -e

# The log files below are written through tee/redirection, which cannot
# create a missing parent directory.
mkdir -p "$LOG_DIR"

# Start each log with its header (truncating any log left by a previous run);
# the test output is appended after it.
echo "Simply verify if Horovod works well. You can follow progress on the log file : $MPI_TRAINING_LOG" \
  | tee "$MPI_TRAINING_LOG"

# Probe for a usable GPU. `|| ...` keeps a failing (or missing) nvidia-smi
# from aborting the script under `set -e`.
GPU_PROBE_STATUS=0
nvidia-smi || GPU_PROBE_STATUS=$?

# ---------------------------------------------------------------------------
# Step 1: basic Horovod check.
# ---------------------------------------------------------------------------
DEVICE_ARGS=()
if [ "$GPU_PROBE_STATUS" -eq 0 ]; then
  echo "Running testTF2HVD.py on gpu"
else
  echo "Running testTF2HVD.py on cpu"
  DEVICE_ARGS=(--no-cuda)
fi

# The verdict for this step is taken from the log content below; the
# pipeline's status is that of tee, so python's own exit code is not used.
python "${BIN_DIR}/testTF2HVD.py" "${DEVICE_ARGS[@]}" 2>&1 | tee -a "$MPI_TRAINING_LOG"

if grep "Loss =" "$MPI_TRAINING_LOG"; then
  echo "Successfully verified Horovod works with mpi."
else
  echo "Horovod training failed with mpi."
  cat "$MPI_TRAINING_LOG"
  exit 1
fi

# ---------------------------------------------------------------------------
# Step 2: resnet benchmark on synthetic data.
# ---------------------------------------------------------------------------
echo "Train a resnet model using Horovod with Tensorflow on synthetic data. You can follow progress on the log file : $TRAINING_LOG" \
  | tee "$TRAINING_LOG"

# Capture the benchmark's exit status instead of letting `set -e` kill the
# script, so a failure is always reported together with the training log.
BENCHMARK_STATUS=0
if [ "$GPU_PROBE_STATUS" -eq 0 ]; then
  DEVICE_NAME=GPU
  # One process per GPU listed by nvidia-smi, capped at MAX_GPUS.
  GPU_AMOUNT=$(nvidia-smi -L | wc -l)
  if [ "$GPU_AMOUNT" -ge "$MAX_GPUS" ]; then
    GPU_AMOUNT=$MAX_GPUS
  fi
  horovodrun -np "$GPU_AMOUNT" -H "localhost:$GPU_AMOUNT" \
    python -W ignore "$BENCHMARK_SCRIPT" >> "$TRAINING_LOG" 2>&1 \
    || BENCHMARK_STATUS=$?
else
  DEVICE_NAME=CPU
  python -W ignore "$BENCHMARK_SCRIPT" --no-cuda >> "$TRAINING_LOG" 2>&1 \
    || BENCHMARK_STATUS=$?
fi

if [ "$BENCHMARK_STATUS" -ne 0 ]; then
  echo "Horovod training using resnet50 model on synthetic data failed on ${DEVICE_NAME} image"
  echo "Training output: "
  cat "$TRAINING_LOG"
  exit 1
fi

# Either marker line in the log counts as a successful run.
if grep -e "Total img/sec on" -e "Finished in" "$TRAINING_LOG"; then
  echo "Horovod sanity test using resnet50 model on synthetic data successful."
  exit 0
fi

echo "Horovod sanity test using resnet50 model on synthetic data failed."
echo "Training output: "
cat "$TRAINING_LOG"
exit 1