---
# MPIJob that runs /fsx/bert-pytorch2/bert_pretraining.sh through mpirun.
# One Launcher pod starts 2 MPI processes (-np 2); with 2 Worker replicas and
# slotsPerWorker: 1 that is one process per worker pod. Each worker pod
# requests 8 GPUs and 4 EFA devices.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bert-efa-32
spec:
  slotsPerWorker: 1
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          volumes:
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
          containers:
            - image: 231748552833.dkr.ecr.us-east-1.amazonaws.com/aws/bert-pt-py3:latest
              imagePullPolicy: Always
              name: bert-efa-pretraining
              # NOTE(review): Kubernetes only substitutes $(VAR_NAME)
              # references in env values. The shell-style forms below ($VAR and
              # $((...))) reach the container as literal text. Presumably the
              # training script or a shell re-evaluates them; verify before
              # relying on NNODES, NODE_RANK, WORLD_SIZE, PATH or
              # LD_LIBRARY_PATH having expanded values.
              env:
                - name: GPUS_PER_NODE
                  value: "8"
                # Presumably resolves to the first worker pod of this job
                # (<job name>-worker-0); confirm against the operator's naming.
                - name: MASTER_ADDR
                  value: bert-efa-32-worker-0
                - name: MASTER_PORT
                  value: "12345"
                - name: NNODES
                  value: '$OMPI_MCA_orte_num_nodes'
                - name: NODE_RANK
                  value: '$OMPI_COMM_WORLD_RANK'
                - name: WORLD_SIZE
                  value: '$(($GPUS_PER_NODE*$NNODES))'
                - name: LD_LIBRARY_PATH
                  value: '/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH'
                - name: PATH
                  value: '$PATH:/opt/amazon/efa/bin:/usr/bin:/usr/sbin'
                - name: CODE_DIR
                  value: /workspace/bert
                - name: INPUT_DIR
                  value: /fsx/bert-pytorch2/pretrain/phase1/unbinned/parquet
              command:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Total MPI process count; matches Worker replicas (2) times
                # slotsPerWorker (1).
                - -np
                - "2"
                - -bind-to
                - none
                - -map-by
                - slot
                # Each "-x" pair forwards (or sets, for NAME=value) an
                # environment variable for the launched processes.
                - -x
                - PATH
                - -x
                - LD_LIBRARY_PATH
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - MASTER_ADDR
                - -x
                - MASTER_PORT
                - -x
                - NCCL_ALGO=RING
                - -x
                - FI_PROVIDER=efa
                - -x
                - FI_EFA_USE_DEVICE_RDMA=1
                - -x
                - CODE_DIR
                - -x
                - INPUT_DIR
                # "^cm" excludes the cm component from PML selection.
                - --mca
                - pml
                - ^cm
                - --oversubscribe
                - /fsx/bert-pytorch2/bert_pretraining.sh
              volumeMounts:
                - name: fsx
                  mountPath: /fsx
    Worker:
      replicas: 2
      template:
        spec:
          volumes:
            # Memory-backed emptyDir, mounted at /dev/shm in the container.
            - name: dshm
              emptyDir:
                medium: Memory
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
          nodeSelector:
            # Stable replacement for the deprecated
            # beta.kubernetes.io/instance-type label.
            node.kubernetes.io/instance-type: p4de.24xlarge
          containers:
            - image: 231748552833.dkr.ecr.us-east-1.amazonaws.com/aws/bert-pt-py3:latest
              imagePullPolicy: Always
              name: bert-worker
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
                - name: fsx
                  mountPath: /fsx
              # Requests are set equal to limits for every resource.
              resources:
                limits:
                  cpu: 92
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 180000Mi
                requests:
                  cpu: 92
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 180000Mi