---
# PyTorchJob (kubeflow.org/v1) named "pytorch-job-efs".
#
# Runs pipeline/mnist.py from a volume backed by the "efs-pvc"
# PersistentVolumeClaim, with 1 Master replica and 2 Worker replicas.
# The Master and Worker pod templates are intentionally identical (same
# image, args, volume mount and annotations); keep them in sync when editing.
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-job-efs"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Quoted so the annotation value stays a string, not a boolean.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-cpu-py38-ubuntu20.04-e3
              volumeMounts:
                - mountPath: /efs-shared
                  name: efs-shared
              # NOTE(review): the script path is relative ("./efs-shared/...")
              # while the volume is mounted at the absolute path /efs-shared,
              # so this presumably relies on the container's working
              # directory being "/" - confirm against the image.
              # Numeric flag values are quoted so they are passed as strings.
              args:
                - python
                - ./efs-shared/pipeline/mnist.py
                - --epochs
                - "5"
                - --seed
                - "7"
                - --log-interval
                - "60"
              # Optional GPU request, left disabled.
              # NOTE(review): the image tag above is a "-cpu-" build; enabling
              # this presumably also needs a GPU image - confirm.
              # resources:
              #   limits:
              #     nvidia.com/gpu: 1
          volumes:
            - name: efs-shared
              persistentVolumeClaim:
                claimName: efs-pvc
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Quoted so the annotation value stays a string, not a boolean.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-cpu-py38-ubuntu20.04-e3
              volumeMounts:
                - mountPath: /efs-shared
                  name: efs-shared
              # Same command line as the Master replica (see the review note
              # there about the relative script path).
              args:
                - python
                - ./efs-shared/pipeline/mnist.py
                - --epochs
                - "5"
                - --seed
                - "7"
                - --log-interval
                - "60"
              # Optional GPU request, left disabled (image is a "-cpu-" build).
              # resources:
              #   limits:
              #     nvidia.com/gpu: 1
          volumes:
            - name: efs-shared
              persistentVolumeClaim:
                claimName: efs-pvc