# Manifests for the "bart-squad" distributed training job:
#   1. a headless Service ("lead-node") that selects worker replica 0, and
#   2. a Kubeflow MPIJob ("bart-squad") with one launcher and two GPU workers.
#
# TODO: create a helm chart to parametrize these manifests and keep them DRY
# (job name, image, worker count and -np are currently repeated by hand).
---
apiVersion: v1
kind: Service
metadata:
  name: lead-node
  namespace: mpi-operator
spec:
  # Headless service: no cluster IP is allocated.
  clusterIP: None
  selector:
    # Keep this the same as metadata.name of the MPIJob below.
    training.kubeflow.org/job-name: bart-squad
    training.kubeflow.org/job-role: worker
    # Only the first worker replica is selected.
    training.kubeflow.org/replica-index: "0"
  ports:
    # NOTE(review): presumably the port the training script uses to reach the
    # lead node -- confirm against the script that reads LEAD_NODE.
    - protocol: TCP
      port: 29400
      targetPort: 29400
---
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bart-squad
  namespace: mpi-operator
spec:
  # Number of GPUs on each P4de.
  slotsPerWorker: 8
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            # TODO: no need for the heavy GPU image on the launcher.
            - image: 123.dkr.ecr.us-east-1.amazonaws.com/lipovsek:jax
              # Alternative considered by the original author: Always
              imagePullPolicy: IfNotPresent
              name: launcher
              resources:
                limits:
                  cpu: 2
                  memory: 5Gi
                requests:
                  cpu: 2
                  memory: 5Gi
              env:
                # TODO: issue with expanding (the full value is spelled out
                # here; presumably $PATH expansion did not work -- confirm).
                - name: PATH
                  value: "/opt/amazon/efa/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
                # TODO: issue with expanding (same as PATH above).
                - name: LD_LIBRARY_PATH
                  value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
                - name: XLA_FLAGS
                  value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
                - name: TF_XLA_FLAGS
                  value: "--tf_xla_cpu_global_jit"
              # NOTE(review): the command is passed via `args`, not `command`,
              # so it relies on the image's entrypoint executing its
              # arguments -- confirm against the image.
              args:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Total ranks: 2 workers x 8 slots per worker. Keep in sync
                # with Worker.replicas and slotsPerWorker.
                - -np
                - "16"
                # 8 processes per node, matching slotsPerWorker.
                - --map-by
                - "ppr:8:node"
                - --bind-to
                - none
                # Exclude the "cm" PML component (leading ^ means exclude).
                - --mca
                - pml
                - ^cm
                - --mca
                - btl
                - "tcp,self"
                # NOTE(review): "docker1" is kept as written; confirm it is
                # the intended interface name on the worker nodes.
                - --mca
                - btl_tcp_if_exclude
                - "lo,docker1"
                # Each -x exports an environment variable to the MPI ranks:
                # a bare name forwards the launcher's value, NAME=value sets
                # it explicitly.
                - -x
                - PATH
                - -x
                - LD_LIBRARY_PATH
                - -x
                - XLA_FLAGS
                - -x
                - TF_XLA_FLAGS
                - -x
                - FI_PROVIDER=efa
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - FI_EFA_USE_DEVICE_RDMA=1
                - -x
                - RDMAV_FORK_SAFE=1
                - -x
                - NCCL_PROTO=simple
                # Name of the headless Service defined at the top of the file.
                - -x
                - LEAD_NODE=lead-node
                - /bin/bash
                - /tmp/synthetic.sh
                # Alternative script, currently disabled:
                # - /workspace/transformers/examples/flax/question-answering/run.sh
    Worker:
      replicas: 2
      template:
        spec:
          nodeSelector:
            alpha.eksctl.io/nodegroup-name: gpu-compute
          tolerations:
            - key: "smml.io/gpu"
              operator: Exists
              effect: NoSchedule
          containers:
            - image: 123.dkr.ecr.us-east-1.amazonaws.com/lipovsek:jax
              imagePullPolicy: Always
              name: training-worker
              # Requests equal limits for every resource.
              resources:
                limits:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 138Gi
                  cpu: 11
                requests:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 138Gi
                  cpu: 11