---
# MPIJob for the Kubeflow MPI Operator (v1alpha2 API).
# The launcher pod runs `fi_info -p efa -t FI_EP_RDM` through mpirun with two
# ranks; each worker pod requests one GPU and one EFA device.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: efa-info-test
spec:
  slotsPerWorker: 1
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    # Launcher: the single pod that invokes mpirun.
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            # Alternative image from a private ECR registry; substitute the
            # account ID and region before enabling:
            # - image: <account-id>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: efa-info-launcher
              # imagePullPolicy is a Container field; it is not part of
              # PodSpec, so it has to be declared here to take effect.
              imagePullPolicy: IfNotPresent
              env:
                # NOTE(review): Kubernetes only expands `$(VAR)` references to
                # variables defined in this manifest. `$LD_LIBRARY_PATH` and
                # `$PATH` below reach the container as literal text, not as
                # the image's existing values - confirm this is acceptable.
                - name: LD_LIBRARY_PATH
                  value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
                - name: PATH
                  value: $PATH:/opt/amazon/efa/bin:/usr/bin
                - name: XLA_FLAGS
                  value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
                - name: TF_XLA_FLAGS
                  value: "--tf_xla_cpu_global_jit"
                - name: NCCL_DEBUG
                  value: INFO
              command:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Two ranks in total, matching Worker.replicas * slotsPerWorker.
                - -np
                - "2"
                - -bind-to
                - none
                - -map-by
                - slot
                # Environment variables forwarded to the ranks with `-x`.
                - -x
                - PATH
                - -x
                - LD_LIBRARY_PATH
                - -x
                - XLA_FLAGS
                - -x
                - TF_XLA_FLAGS
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - NCCL_ALGO=RING
                - -x
                - FI_EFA_USE_DEVICE_RDMA=0
                - -x
                - RDMAV_FORK_SAFE=0
                # NOTE(review): NCCL_DEBUG is already exported above as
                # NCCL_DEBUG=INFO; this second export looks redundant.
                - -x
                - NCCL_DEBUG
                - --mca
                - pml
                - ^cm
                # NOTE(review): `--mca` normally takes the parameter name and
                # its value as two separate arguments, and the rsh agent
                # parameter is usually spelled `plm_rsh_agent`. As written
                # this probably does not select ssh and may consume the next
                # argument as its value. Left unchanged because the operator
                # may supply its own launch agent - confirm before editing.
                - --mca
                - pml_rsh_agent=ssh
                - --oversubscribe
                # Program run by each rank.
                - /opt/amazon/efa/bin/fi_info
                - -p
                - "efa"
                - -t
                - "FI_EP_RDM"
    # Workers: the pods that host the MPI ranks.
    Worker:
      replicas: 2
      template:
        spec:
          containers:
            # Alternative image from a private ECR registry; substitute the
            # account ID and region before enabling:
            # - image: <account-id>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: efa-info-worker
              # imagePullPolicy is a Container field; it is not part of
              # PodSpec, so it has to be declared here to take effect.
              imagePullPolicy: IfNotPresent
              resources:
                # Requests and limits are kept identical for the GPU and EFA
                # extended resources.
                limits:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 1
                  # memory: 8000Mi
                requests:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 1
                  # memory: 8000Mi