---
# MPIJob that runs the nccl-tests `all_reduce_perf` benchmark with NCCL debug
# logging enabled. One launcher pod invokes mpirun; two worker pods host the
# MPI ranks.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: nccl-test-debug
spec:
  slotsPerWorker: 8
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - name: nccl-test-debug
              image: 898739678081.dkr.ecr.us-west-2.amazonaws.com/efa-eks-benchmark:v0.2-efa-nccl-debug-root
              command: ["/bin/sh"]
              # PATH and LD_LIBRARY_PATH are extended inside the shell script
              # rather than through `env:`. Kubernetes does not expand
              # shell-style `$VAR` references in `env[].value` (only `$(VAR)`
              # references to variables declared in the manifest), so setting
              # them there would replace the image's values with literal
              # "$PATH" / "$LD_LIBRARY_PATH" text.
              #
              # The script is a folded scalar: every line below is joined with
              # a single space into one shell command line. Keep all lines at
              # the same indentation, otherwise YAML preserves the line breaks.
              #
              # mpirun launches 2 ranks in total (-n 2), one per node (-N 1),
              # and forwards the NCCL_* / FI_* settings to the ranks with -x.
              args:
                - "-c"
                - >-
                  export PATH="$PATH:/opt/amazon/efa/bin";
                  export LD_LIBRARY_PATH="/opt/amazon/openmpi/lib:/nccl/build/lib:/opt/amazon/efa/lib:/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}";
                  /opt/amazon/openmpi/bin/mpirun
                  --allow-run-as-root
                  -n 2 -N 1
                  -x NCCL_DEBUG=INFO
                  -x NCCL_ALGO=ring
                  -x FI_LOG_PROV=efa
                  -x FI_EFA_TX_MIN_CREDITS=64
                  -x FI_EFA_ENABLE_SHM_TRANSFER=0
                  --mca plm_rsh_no_tree_spawn 1
                  --bind-to none
                  --mca pml ob1
                  --mca mtl ofi
                  --mca mtl_ofi_provider_include efa
                  --mca oob_tcp_if_include eth0
                  --mca btl_tcp_if_include eth0
                  --oversubscribe
                  /tmp/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 1000
    Worker:
      replicas: 2
      template:
        spec:
          containers:
            - name: nccl-test-debug
              image: 898739678081.dkr.ecr.us-west-2.amazonaws.com/efa-eks-benchmark:v0.2-efa-nccl-debug-root
              # Privileged mode plus the hostPath mount below expose the host's
              # /dev/infiniband/uverbs0 device node inside the container.
              securityContext:
                privileged: true
              volumeMounts:
                - name: infiniband-efa
                  mountPath: /dev/infiniband/uverbs0
              resources:
                limits:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 256Mi
                requests:
                  memory: 8000Mi
          volumes:
            - name: infiniband-efa
              hostPath:
                path: /dev/infiniband/uverbs0