---
# Kubeflow PyTorchJob named `pytorch-job-efs`.
# The per-role pod templates are declared under spec.pytorchReplicaSpecs.
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: pytorch-job-efs
spec:
  pytorchReplicaSpecs:
    # Master role: a single pod, restarted when its container fails.
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Opt this pod out of Istio sidecar injection.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # CPU build of the PyTorch 1.11.0 training image (per the tag).
              image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-cpu-py38-ubuntu20.04-e3
              volumeMounts:
                # Shared volume that holds the training script.
                - name: efs-shared
                  mountPath: /efs-shared
              # No `command` is set, so these args are passed to the image's
              # own entrypoint.
              args:
                - python
                # Absolute path under the /efs-shared mount, so the script is
                # found regardless of the container's working directory.
                - /efs-shared/pipeline/mnist.py
                - --epochs
                - "5"
                - --seed
                - "7"
                - --log-interval
                - "60"
              # Uncomment to request one GPU for this pod. The image above is
              # a CPU build, so a GPU image would presumably be needed too.
              # resources:
              #   limits:
              #     nvidia.com/gpu: 1
          volumes:
            # Backs the /efs-shared mount with the `efs-pvc` claim.
            - name: efs-shared
              persistentVolumeClaim:
                claimName: efs-pvc

    # Worker role: two pods, each restarted when its container fails.
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Opt these pods out of Istio sidecar injection.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # CPU build of the PyTorch 1.11.0 training image (per the tag).
              image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-cpu-py38-ubuntu20.04-e3
              volumeMounts:
                # Shared volume that holds the training script.
                - name: efs-shared
                  mountPath: /efs-shared
              # No `command` is set, so these args are passed to the image's
              # own entrypoint.
              args:
                - python
                # Absolute path under the /efs-shared mount, so the script is
                # found regardless of the container's working directory.
                - /efs-shared/pipeline/mnist.py
                - --epochs
                - "5"
                - --seed
                - "7"
                - --log-interval
                - "60"
              # Uncomment to request one GPU per pod. The image above is a
              # CPU build, so a GPU image would presumably be needed too.
              # resources:
              #   limits:
              #     nvidia.com/gpu: 1
          volumes:
            # Backs the /efs-shared mount with the `efs-pvc` claim.
            - name: efs-shared
              persistentVolumeClaim:
                claimName: efs-pvc