# Kubeflow MPIJob that runs BERT-large pretraining through mpirun.
# The image references below contain ${ACCOUNT} and ${REGION} placeholders,
# so this manifest presumably needs variable substitution (e.g. envsubst)
# before it is applied -- confirm against the deployment tooling.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bert-large-pretraining-efa
spec:
  # One Launcher pod drives mpirun; Worker pods host the MPI ranks.
  mpiReplicaSpecs:
    # Launcher: a single pod that invokes mpirun, which starts the training
    # script on the worker pods. It requests no GPU/EFA resources itself.
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
          - command:
            - mpirun
            # --- Environment forwarded to every rank via `-x NAME=value` ---
            - -x
            - LD_LIBRARY_PATH=/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/shared/lib:/opt/hpcx/ompi/lib
            - -x
            - PATH=/shared/bin:/usr/local/sbin:/usr/local/bin:/usr/bin:/usr/bin/site_perl:/usr/bin/vendor_perl:/usr/bin/core_perl
            - -x
            - NCCL_SHM_DISABLE=0
            - -map-by
            - slot
            - -x
            - NCCL_MIN_NCHANNELS=8
            - -x
            - NCCL_DEBUG=INFO
            - -x
            - NCCL_SOCKET_IFNAME=eth0
            - -x
            - FI_EFA_FORK_SAFE=1
            # Fixed: was misspelled RDMAV_FORM_SAFE, a name libibverbs does not
            # read, so fork-safety was never actually enabled on the ranks.
            - -x
            - RDMAV_FORK_SAFE=1
            - -x
            - NCCL_PROTO=simple
            - -x
            - NCCL_ALGO=RING
            - -x
            - FI_LOG_LEVEL=warn
            - -x
            - FI_EFA_USE_DEVICE_RDMA=0
            # `-x NAME` without a value forwards the launcher's own value,
            # i.e. the TF_XLA_FLAGS entry from `env:` below.
            # NOTE(review): XLA_FLAGS is also set in `env:` but is not
            # forwarded with -x -- confirm whether the ranks need it.
            - -x
            - TF_XLA_FLAGS
            # --- mpirun launch options ---
            - --allow-run-as-root
            # 16 ranks = 2 Worker replicas x slotsPerWorker 8.
            - -np
            - "16"
            - --oversubscribe
            # --- Program and its arguments, executed once per rank ---
            - /usr/bin/python3
            - /workspace/BERT/run_pretraining.py
            - --input_files_dir=/fsx/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
            - --eval_files_dir=/fsx/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test
            - --output_dir=/fsx/workspace/bert/shared/checkpoints/phase_1
            - --bert_config_file=/fsx/workspace/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
            - --do_train=True
            - --do_eval=True
            - --train_batch_size=64
            - --eval_batch_size=8
            - --max_seq_length=128
            - --max_predictions_per_seq=20
            - --num_train_steps=100000
            - --num_warmup_steps=2800
            - --num_accumulation_steps=128
            - --save_checkpoints_steps=100
            - --learning_rate=4.6875e-5
            - --horovod
            - --amp=false
            - --manual_fp16
            - --use_xla=true
            - --allreduce_post_accumulation=True
            # Environment of the launcher process itself; only variables named
            # with `-x` above are propagated to the ranks by mpirun.
            env:
            - name: XLA_FLAGS
              value: --xla_gpu_cuda_data_dir=/usr/local/cuda
            - name: TF_XLA_FLAGS
              value: --tf_xla_cpu_global_jit
            image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
            imagePullPolicy: Always
            name: test-run-launcher
            volumeMounts:
            - name: fsx
              mountPath: /fsx
          # Delays the launcher container by 5 seconds; presumably to give the
          # worker pods time to come up -- confirm this is long enough.
          initContainers:
          - command:
            - sh
            - -c
            - sleep 5
            image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
            name: init
          restartPolicy: Never
          # Shared volume backed by the fsx-claim PVC; holds the dataset,
          # BERT config and checkpoint paths referenced in the arguments.
          volumes:
          - name: fsx
            persistentVolumeClaim:
              claimName: fsx-claim
    # Worker: two pods that host the MPI ranks. No command is given here, so
    # each container runs whatever entrypoint the image defines.
    Worker:
      replicas: 2
      template:
        spec:
          restartPolicy: Never
          containers:
          - name: test-run-worker
            image: ${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/bert:1.0
            imagePullPolicy: Always
            # Requests are identical to limits for every resource.
            resources:
              requests:
                nvidia.com/gpu: 8
                vpc.amazonaws.com/efa: 4
                memory: 400Gi
                hugepages-2Mi: 5120Mi
              limits:
                nvidia.com/gpu: 8
                vpc.amazonaws.com/efa: 4
                memory: 400Gi
                hugepages-2Mi: 5120Mi
            volumeMounts:
            - name: fsx
              mountPath: /fsx
            - name: dshm
              mountPath: /dev/shm
            - name: wd
              mountPath: /wd
          volumes:
          # Same PVC the launcher mounts.
          - name: fsx
            persistentVolumeClaim:
              claimName: fsx-claim
          # Memory-backed emptyDir mounted over /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
          # The node's /tmp directory, exposed in the container at /wd.
          - name: wd
            hostPath:
              path: /tmp
  # MPI slots advertised per worker pod; equals the 8 GPUs each worker
  # requests, and 2 workers x 8 slots gives the 16 ranks passed to `-np`.
  slotsPerWorker: 8
  # Pod clean-up behaviour once the job finishes (see the MPI Operator
  # documentation for the exact semantics of `Running`).
  runPolicy:
    cleanPodPolicy: Running