apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: cifar10-train
spec:
  elasticPolicy:
    rdzvBackend: etcd
    rdzvHost: etcd-service
    rdzvPort: 2379
    minReplicas: 1
    maxReplicas: 128
    maxRestarts: 100
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      replicas: ${NODE_COUNT}
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: ${REGISTRY}${IMAGE}${TAG}
              imagePullPolicy: IfNotPresent
              env:
                - name: PROCESSOR
                  value: "${PROCESSOR}"
              command:
                - python3
                - -m
                - torch.distributed.run
                - /workspace/cifar10-model-train.py
                - "--epochs=${EPOCHS}"
                - "--batch-size=${BATCH_SIZE}"
                - "--workers=${CPU_LIMIT}"
                - "--model-file=${MOUNT_PATH}/cifar10-model.pth"
                - "${MOUNT_PATH}/cifar-10-batches-py/"
              volumeMounts:
                - name: efs-pv
                  mountPath: ${MOUNT_PATH}
                # The following enables the worker pods to use increased shared memory
                # which is required when specifying more than 0 data loader workers
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: efs-pv
              persistentVolumeClaim:
                claimName: efs-pvc
            - name: dshm
              emptyDir:
                medium: Memory
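The manifest launches each worker with `torch.distributed.run`, which hands rendezvous and per-process environment variables (RANK, LOCAL_RANK, WORLD_SIZE) to the training script. The actual `/workspace/cifar10-model-train.py` is not shown here; the sketch below is a minimal, hypothetical illustration of how such a script could consume the arguments passed in the manifest's `command` (positional data directory, `--epochs`, `--batch-size`, `--workers`, `--model-file`). It also shows why the `/dev/shm` mount matters: DataLoader workers (`num_workers > 0`) pass tensors between processes through shared memory.

```python
# Hypothetical sketch of the entry point launched by torch.distributed.run.
# Argument names mirror the manifest's command; everything else is an assumption.
import argparse
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision import datasets, models, transforms


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir")                          # ${MOUNT_PATH}/cifar-10-batches-py/
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch-size", type=int, default=128)
    parser.add_argument("--workers", type=int, default=4)    # >0 requires the /dev/shm mount
    parser.add_argument("--model-file", default="cifar10-model.pth")
    args = parser.parse_args()

    # torch.distributed.run populates the env:// rendezvous variables.
    use_cuda = torch.cuda.is_available()
    dist.init_process_group(backend="nccl" if use_cuda else "gloo")
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")

    # torchvision expects the root directory that contains cifar-10-batches-py/.
    root = os.path.dirname(args.data_dir.rstrip("/"))
    dataset = datasets.CIFAR10(root=root, train=True, download=False,
                               transform=transforms.ToTensor())
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset, batch_size=args.batch_size,
                        sampler=sampler, num_workers=args.workers)

    model = models.resnet18(num_classes=10).to(device)
    model = DDP(model, device_ids=[local_rank] if use_cuda else None)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        sampler.set_epoch(epoch)  # reshuffle shards each epoch
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(images), labels)
            loss.backward()
            optimizer.step()

    if dist.get_rank() == 0:
        torch.save(model.module.state_dict(), args.model_file)
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

The job's `elasticPolicy` (etcd rendezvous, 1 to 128 replicas) lets workers join or leave without restarting training from scratch, which is why the script relies only on the rendezvous environment rather than hard-coded world sizes.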