---
# Kubeflow MPIJob that starts /workspace/BERT/run_pretraining.py through
# mpirun. One Launcher pod runs mpirun; two Worker pods each expose 8 slots,
# matching the 16 ranks requested with `-np "16"`.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bert-large-pretraining-efa
spec:
  # Slots advertised per worker pod; equals the nvidia.com/gpu count that
  # each worker requests below.
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: Never
          initContainers:
            # Pauses for 5 seconds before the launcher container starts.
            - name: init
              image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
              command:
                - sh
                - -c
                - sleep 5
          containers:
            - name: test-run-launcher
              image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
              imagePullPolicy: Always
              env:
                # NOTE(review): XLA_FLAGS is set only on this launcher
                # container and, unlike TF_XLA_FLAGS, is not forwarded to the
                # ranks with `-x` below -- confirm whether workers need it.
                - name: XLA_FLAGS
                  value: '--xla_gpu_cuda_data_dir=/usr/local/cuda'
                - name: TF_XLA_FLAGS
                  value: '--tf_xla_cpu_global_jit'
              volumeMounts:
                - name: fsx
                  mountPath: /fsx
              # Each `-x` entry is followed by the variable mpirun should
              # export to the launched ranks. Argument order is significant.
              command:
                - mpirun
                - -x
                - LD_LIBRARY_PATH=/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/shared/lib:/opt/hpcx/ompi/lib
                - -x
                - PATH=/shared/bin:/usr/local/sbin:/usr/local/bin:/usr/bin:/usr/bin/site_perl:/usr/bin/vendor_perl:/usr/bin/core_perl
                - -x
                - NCCL_SHM_DISABLE=0
                - -map-by
                - slot
                - -x
                - NCCL_MIN_NCHANNELS=8
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - NCCL_SOCKET_IFNAME=eth0
                - -x
                - FI_EFA_FORK_SAFE=1
                # libibverbs fork-safety switch. The name must be spelled
                # RDMAV_FORK_SAFE; a misspelled variable is silently ignored.
                - -x
                - RDMAV_FORK_SAFE=1
                - -x
                - NCCL_PROTO=simple
                - -x
                - NCCL_ALGO=RING
                - -x
                - FI_LOG_LEVEL=warn
                - -x
                - FI_EFA_USE_DEVICE_RDMA=0
                # No value given: forwards TF_XLA_FLAGS from the `env` section
                # of this container.
                - -x
                - TF_XLA_FLAGS
                - --allow-run-as-root
                # 16 ranks = Worker replicas (2) x slotsPerWorker (8).
                - -np
                - "16"
                - --oversubscribe
                - /usr/bin/python3
                - /workspace/BERT/run_pretraining.py
                - --input_files_dir=/fsx/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
                - --eval_files_dir=/fsx/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test
                - --output_dir=/fsx/workspace/bert/shared/checkpoints/phase_1
                - --bert_config_file=/fsx/workspace/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
                - --do_train=True
                - --do_eval=True
                - --train_batch_size=64
                - --eval_batch_size=8
                - --max_seq_length=128
                - --max_predictions_per_seq=20
                - --num_train_steps=100000
                - --num_warmup_steps=2800
                - --num_accumulation_steps=128
                - --save_checkpoints_steps=100
                - --learning_rate=4.6875e-5
                - --horovod
                - --amp=false
                - --manual_fp16
                - --use_xla=true
                - --allreduce_post_accumulation=True
          volumes:
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
    Worker:
      replicas: 2
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: test-run-worker
              image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
              imagePullPolicy: Always
              # Requests and limits are identical for every resource.
              resources:
                requests:
                  hugepages-2Mi: 5120Mi
                  memory: 400Gi
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
                limits:
                  hugepages-2Mi: 5120Mi
                  memory: 400Gi
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
              volumeMounts:
                - name: dshm
                  mountPath: /dev/shm
                - name: wd
                  mountPath: /wd
                - name: fsx
                  mountPath: /fsx
          volumes:
            - name: fsx
              persistentVolumeClaim:
                claimName: fsx-claim
            # Memory-backed emptyDir mounted at /dev/shm in the worker.
            - name: dshm
              emptyDir:
                medium: Memory
            # Node's /tmp directory, mounted at /wd in the worker.
            - name: wd
              hostPath:
                path: /tmp