{{- /*
Run directory used by both mpirun's --output-filename and the trainer's
--logdir: /<shared_fs>/<release name>-<timestamp>.
It is computed once so both arguments always point at the same directory.
The timestamp comes from `now` instead of .Release.Time, which is not among the
Release fields documented for Helm 3.
*/ -}}
{{- $runDir := printf "/%v/%v-%v" .Values.global.shared_fs .Release.Name (now | date "2006-01-02-15-04-05") -}}
---
# Priority class referenced by the launcher and worker pod templates below.
# NOTE(review): PriorityClass is a cluster-scoped kind and the name is fixed,
# so a second release of this chart in the same cluster would presumably
# collide on it - confirm this is acceptable.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: training-priority
value: 1000
preemptionPolicy: PreemptLowerPriority
globalDefault: false
description: "This is the priority class for training"
---
# MPIJob that launches the Mask R-CNN training script through mpirun.
# One launcher pod runs mpirun; `gpu_nodes` worker pods each request
# `gpus_per_node` GPUs.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: {{ .Values.maskrcnn.name }}
  namespace: {{ .Values.global.namespace }}
  labels:
    app.kubernetes.io/name: {{ .Values.maskrcnn.name }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
  runPolicy:
    backoffLimit: {{ .Values.maskrcnn.backoff_limit }}
    cleanPodPolicy: Running
  mpiImplementation: OpenMPI
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          priorityClassName: "training-priority"
          restartPolicy: OnFailure
          # NOTE(review): the launcher declares no volumes, yet mpirun is
          # given an --output-filename under the shared file system path.
          # Confirm where that output is expected to be written.
          containers:
            - name: {{ .Values.maskrcnn.name }}
              env:
                - name: HOROVOD_AUTOTUNE
                  value: "1"
                - name: NCCL_SOCKET_IFNAME
                  value: "{{ .Values.maskrcnn.nccl_socket_ifname }}"
                - name: NCCL_DEBUG
                  value: "{{ .Values.maskrcnn.nccl_debug }}"
                - name: NCCL_BUFFSIZE
                  value: "{{ .Values.maskrcnn.nccl_buffsize }}"
                - name: TF_DEVICE_MIN_SYS_MEMORY_IN_MB
                  value: "{{ .Values.maskrcnn.tf_device_min_sys_mem_mb }}"
                - name: TF_CPP_MIN_LOG_LEVEL
                  value: "2"
              command:
                - mpirun
              args:
                - --output-filename
                - {{ $runDir | quote }}
                - --allow-run-as-root
                - -np
                - "{{ .Values.maskrcnn.gpus }}"
                - -bind-to
                - none
                - -map-by
                - slot
                # The same interface exclusion list is applied to both the
                # btl_tcp and oob_tcp MCA parameters.
                - -mca
                - btl_tcp_if_exclude
                - {{ .Values.maskrcnn.if_exclude | quote }}
                - -mca
                - oob_tcp_if_exclude
                - {{ .Values.maskrcnn.if_exclude | quote }}
                - -mca
                - plm_rsh_no_tree_spawn
                - "1"
                # Each -x names an environment variable passed to mpirun for
                # export; the first six are the ones set in `env` above.
                - -x
                - HOROVOD_AUTOTUNE
                - -x
                - NCCL_SOCKET_IFNAME
                - -x
                - NCCL_DEBUG
                - -x
                - NCCL_BUFFSIZE
                - -x
                - TF_DEVICE_MIN_SYS_MEMORY_IN_MB
                - -x
                - TF_CPP_MIN_LOG_LEVEL
                - -x
                - LD_LIBRARY_PATH
                - -x
                - PATH
                - -mca
                - pml
                - ob1
                - -mca
                - btl
                - ^openib
                - --display-map
                - --tag-output
                - --timestamp-output
                # Everything from here on is the training command run by mpirun.
                - python3
                - {{ .Values.maskrcnn.train_script | quote }}
                - --logdir
                - {{ printf "%s/train_log/maskrcnn" $runDir | quote }}
                - --config
                - MODE_MASK={{ .Values.maskrcnn.mode_mask }}
                - MODE_FPN={{ .Values.maskrcnn.mode_fpn }}
                - DATA.BASEDIR=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.data_dir }}
                - DATA.TRAIN={{ .Values.maskrcnn.data_train }}
                - DATA.VAL={{ .Values.maskrcnn.data_val }}
                - TRAIN.EVAL_PERIOD={{ .Values.maskrcnn.eval_period_in_epochs }}
                - TRAIN.STEPS_PER_EPOCH={{ .Values.maskrcnn.steps_per_epoch }}
                - TRAIN.LR_SCHEDULE={{ .Values.maskrcnn.lr_schedule }}
                - BACKBONE.WEIGHTS=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.backbone_weights }}
                - BACKBONE.NORM={{ .Values.maskrcnn.backbone_norm }}
                - TRAINER=horovod
                # Emitted only when set: an empty value would otherwise render
                # a bare list item, i.e. a null entry in args.
                {{- with .Values.maskrcnn.extra_config }}
                - {{ . | quote }}
                {{- end }}
              image: {{ .Values.maskrcnn.image }}
              imagePullPolicy: {{ .Values.maskrcnn.image_pull_policy }}
    Worker:
      replicas: {{ .Values.maskrcnn.gpu_nodes }}
      template:
        spec:
          priorityClassName: "training-priority"
          restartPolicy: Never
          volumes:
            # Shared file system, backed by a PVC and mounted at /<shared_fs>.
            - name: {{ .Values.global.shared_fs }}
              persistentVolumeClaim:
                claimName: {{ .Values.global.shared_pvc }}
            # Host directory /ebs, created on the node if it does not exist.
            - name: ebs
              hostPath:
                path: /ebs
                type: DirectoryOrCreate
            # The node's /dev/shm exposed inside the container.
            - name: shm
              hostPath:
                path: /dev/shm
                type: Directory
          containers:
            - name: {{ .Values.maskrcnn.name }}
              image: {{ .Values.maskrcnn.image }}
              imagePullPolicy: {{ .Values.maskrcnn.image_pull_policy }}
              volumeMounts:
                - mountPath: /{{ .Values.global.shared_fs }}
                  name: {{ .Values.global.shared_fs }}
                - mountPath: /ebs
                  name: ebs
                - mountPath: /dev/shm
                  name: shm
              resources:
                limits:
                  nvidia.com/gpu: {{ .Values.maskrcnn.gpus_per_node }}
  # MPI slots per worker pod; set to the same value as the per-worker GPU limit.
  slotsPerWorker: {{ .Values.maskrcnn.gpus_per_node }}