---
# MPIJob that launches a Horovod/TensorFlow ResNet training benchmark via
# mpirun. The image tag indicates CUDA 10, TensorFlow 1.13.1, Horovod 0.16.0
# and Python 3.5.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: benchmark-eks-resnet-gpu-32
  namespace: default
spec:
  # 4 replicas, each limited to 8 GPUs below; presumably 4 x 8 = 32 GPUs in
  # total, matching the "-gpu-32" suffix of the job name.
  replicas: 4
  template:
    spec:
      containers:
        - name: benchmark-eks-resnet
          image: seedjeffwan/eks-dl-benchmark:cuda10-tf1.13.1-hvd0.16.0-py3.5
          # Each list item is one argv token; flag/value pairs therefore
          # occupy two consecutive items.
          command:
            - mpirun
            # Open MPI MCA parameters: exclude the loopback interface from
            # the TCP transport, select the ob1 point-to-point layer, and
            # disable the openib transport ("^" means "exclude").
            - -mca
            - btl_tcp_if_exclude
            - lo
            - -mca
            - pml
            - ob1
            - -mca
            - btl
            - ^openib
            # Process placement: bind to socket, map by slot.
            - --bind-to
            - socket
            - -map-by
            - slot
            # "-x" exports an environment variable to the launched
            # processes, either forwarding the current value (NAME) or
            # setting it explicitly (NAME=value).
            - -x
            - LD_LIBRARY_PATH
            - -x
            - PATH
            - -x
            - NCCL_DEBUG=INFO
            - -x
            - NCCL_MIN_NRINGS=4
            # 16777216 bytes = 16 MiB.
            - -x
            - HOROVOD_FUSION_THRESHOLD=16777216
            - -x
            - HOROVOD_HIERARCHICAL_ALLREDUCE=1
            - python
            - models/resnet/tensorflow/train_imagenet_resnet_hvd.py
          # Appended after `command`, so these reach the training script.
          # Their exact semantics are defined by that script, which is not
          # part of this manifest.
          args:
            - --batch_size=256
            - --model=resnet50
            - --num_batches=300
            - --fp16
            - --display_every=50
            - --lr_decay_mode=poly
            - --synthetic
            - --intra_op_parallelism_threads=2
            - --inter_op_parallelism_threads=8
            - --num_parallel_calls=8
          resources:
            limits:
              # GPUs requested per replica.
              nvidia.com/gpu: 8