---
# MPIJob that runs the nccl-tests `all_reduce_perf` binary through mpirun.
# One Launcher pod starts 2 MPI processes on 2 Worker pods (1 slot and
# 1 GPU each), with the libfabric provider set to EFA via FI_PROVIDER.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: test-nccl-efa
spec:
  runPolicy:
    cleanPodPolicy: Running
    backoffLimit: 20
  slotsPerWorker: 1
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            # Alternative private image:
            # - image: .dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: test-nccl-efa-launcher
              # imagePullPolicy is a container-level field; it has no effect
              # when placed directly under the pod spec.
              imagePullPolicy: IfNotPresent
              # Optional environment overrides (disabled). Two alternative
              # values are kept for each variable.
              # env:
              #   - name: LD_LIBRARY_PATH
              #     value: /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
              #     value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
              #   - name: PATH
              #     value: /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
              #     value: $PATH:/opt/amazon/efa/bin
              command:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Process count matches Worker replicas (2) x slotsPerWorker (1).
                - -np
                - "2"
                - -bind-to
                - none
                - -map-by
                - slot
                # Environment passed to the MPI processes via mpirun -x.
                - -x
                - PATH
                - -x
                - LD_LIBRARY_PATH
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - NCCL_ALGO=Ring
                - -x
                - FI_PROVIDER=efa
                - -x
                - FI_EFA_USE_DEVICE_RDMA=1
                - -x
                - RDMAV_FORK_SAFE=1
                - -x
                - NCCL_SHM_DISABLE=0
                - --mca
                - pml
                - ^cm
                - --oversubscribe
                # Benchmark binary and its arguments (see the nccl-tests
                # README for flag meanings). Numeric values stay quoted so
                # they remain strings in the command array.
                - /opt/nccl-tests/build/all_reduce_perf
                - -b
                - "1"
                - -e
                - 1G
                - -f
                - "2"
                - -t
                - "1"
                - -g
                - "1"
                - -c
                - "1"
                - -n
                - "100"
    Worker:
      replicas: 2
      template:
        spec:
          # Optional instance-type pinning (disabled); enable at most one key.
          # nodeSelector:
          #   beta.kubernetes.io/instance-type: "p4d.24xlarge"
          #   beta.kubernetes.io/instance-type: "p3dn.24xlarge"
          #   beta.kubernetes.io/instance-type: "g4dn.metal"
          containers:
            # Alternative private image:
            # - image: .dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: test-nccl-efa-worker
              # imagePullPolicy is a container-level field; it has no effect
              # when placed directly under the pod spec.
              imagePullPolicy: Always
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
                - name: efa-vol
                  mountPath: "/dev/infiniband"
              # NOTE(review): privileged mode together with the hostPath mount
              # of /dev/infiniband appears to stand in for the commented-out
              # vpc.amazonaws.com/efa resource request - confirm before reuse.
              securityContext:
                privileged: true
              resources:
                limits:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  # vpc.amazonaws.com/efa: 1
                  memory: 8000Mi
                requests:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  # vpc.amazonaws.com/efa: 1
                  memory: 8000Mi
          volumes:
            # Host paths shared into the worker container (see volumeMounts).
            - name: shmem
              hostPath:
                path: /dev/shm
            - name: efa-vol
              hostPath:
                path: "/dev/infiniband"