---
# Kubeflow MPIJob that uses mpirun to start /fsx/gpt2-pytorch/train_8B_gpt2.sh
# on 16 worker pods (one MPI slot each), each pinned to a p4d.24xlarge node
# with 8 GPUs and 4 EFA devices. Data, vocab, and checkpoints live on the
# shared "fsx-claim" volume mounted at /fsx in both launcher and workers.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: megatron-8b-pretraining-efa
spec:
  # One MPI slot per worker pod: 16 workers x 1 slot matches "-np 16" below.
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          volumes:
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
          containers:
            - name: megatron-training-launcher
              image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:megatron-cu11.6
              # imagePullPolicy is a container-level field. It previously sat
              # on the pod spec, where it is not a valid PodSpec field and
              # therefore never took effect for the launcher.
              imagePullPolicy: Always
              env:
                - name: DATA_PATH
                  value: /fsx/gpt2-pytorch/my-gpt2_text_document
                - name: CHECKPOINT_PATH
                  value: /fsx/gpt2-pytorch/gpt2/checkpoint
                - name: VOCAB_FILE
                  value: /fsx/gpt2-pytorch/gpt2/gpt2-vocab.json
                - name: MERGES_FILE
                  value: /fsx/gpt2-pytorch/gpt2/gpt2-merges.txt
                - name: GPUS_PER_NODE
                  value: "8"
                # Hostname-style reference to the first worker pod
                # (<job-name>-worker-0).
                - name: MASTER_ADDR
                  value: megatron-8b-pretraining-efa-worker-0
                - name: MASTER_PORT
                  value: "6002"
                # NOTE(review): Kubernetes only substitutes $(VAR_NAME)
                # references in env values; shell forms such as $VAR and
                # $((...)) are passed through as literal text. The values
                # below are kept byte-for-byte; presumably the training
                # script derives the real values itself - confirm.
                - name: NNODES
                  value: '$OMPI_MCA_orte_num_nodes'
                - name: NODE_RANK
                  value: '$OMPI_COMM_WORLD_RANK'
                - name: WORLD_SIZE
                  value: '$(($GPUS_PER_NODE*$NNODES))'
                # NOTE(review): the trailing $LD_LIBRARY_PATH and the leading
                # $PATH below are likewise literal path entries, not
                # expansions of the image's existing values.
                - name: LD_LIBRARY_PATH
                  value: '/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH'
                - name: PATH
                  value: '$PATH:/opt/amazon/efa/bin:/usr/local/bin:/usr/bin:/opt/conda/bin'
                - name: XLA_FLAGS
                  value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
                - name: TF_XLA_FLAGS
                  value: "--tf_xla_cpu_global_jit"
                - name: DISTRIBUTED_ARGS
                  value: '--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT'
              command:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Total MPI process count; must equal
                # Worker.replicas * slotsPerWorker.
                - -np
                - "16"
                - -bind-to
                - none
                - -map-by
                - slot
                # Only the variables named with -x are explicitly exported
                # to the launched processes.
                - -x
                - PATH
                - -x
                - LD_LIBRARY_PATH
                - -x
                - MASTER_ADDR
                - -x
                - MASTER_PORT
                - /fsx/gpt2-pytorch/train_8B_gpt2.sh
              volumeMounts:
                - name: fsx
                  mountPath: /fsx
    Worker:
      replicas: 16
      template:
        spec:
          nodeSelector:
            # Stable well-known label; replaces the deprecated
            # beta.kubernetes.io/instance-type.
            node.kubernetes.io/instance-type: p4d.24xlarge
          volumes:
            # Memory-backed emptyDir, mounted at /dev/shm below.
            - name: dshm
              emptyDir:
                medium: Memory
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
          containers:
            # No command is set, so the worker runs the image's default
            # entrypoint (presumably an SSH daemon awaiting mpirun - confirm
            # against the image).
            - name: megatron-worker
              image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:megatron-cu11.6
              imagePullPolicy: Always
              env:
                - name: DATA_PATH
                  value: /fsx/gpt2-pytorch/my-gpt2_text_document
                - name: CHECKPOINT_PATH
                  value: /fsx/gpt2-pytorch/gpt2/checkpoint
                - name: VOCAB_FILE
                  value: /fsx/gpt2-pytorch/gpt2/gpt2-vocab.json
                - name: MERGES_FILE
                  value: /fsx/gpt2-pytorch/gpt2/gpt2-merges.txt
              # Requests equal limits for every resource.
              resources:
                limits:
                  cpu: 92
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 10120Mi
                  memory: 584000Mi
                  vpc.amazonaws.com/efa: 4
                requests:
                  cpu: 92
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 10120Mi
                  memory: 584000Mi
                  vpc.amazonaws.com/efa: 4
              volumeMounts:
                - name: dshm
                  mountPath: /dev/shm
                - name: fsx
                  mountPath: /fsx