---
# PyTorchJob "mnist": runs /opt/pytorch-mnist/mnist.py for 10 epochs using
# one Master replica and two Worker replicas, all from the same image.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: mnist
spec:
  pytorchReplicaSpecs:
    # Single Master replica; no resource requests, labels or affinity are
    # set on it.
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
              imagePullPolicy: Always
              command:
                - "python3"
                - "/opt/pytorch-mnist/mnist.py"
                - "--epochs=10"
    # Two Worker replicas, labelled app=mnist so the podAffinity rule below
    # can select them.
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            app: mnist
        spec:
          # Disabled node pinning, kept for reference.
          # NOTE(review): presumably meant for GPU runs on g4dn.metal
          # together with the disabled nvidia.com/gpu entries below - confirm
          # before re-enabling.
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "g4dn.metal"
          affinity:
            podAffinity:
              # Hard scheduling requirement: place each Worker in the same
              # topology.kubernetes.io/zone as pods labelled app=mnist.
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - mnist
                  topologyKey: "topology.kubernetes.io/zone"
          containers:
            - name: pytorch
              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
              imagePullPolicy: Always
              # requests and limits are set to the same CPU value.
              resources:
                requests:
                  cpu: 4
                  # nvidia.com/gpu: 8
                limits:
                  cpu: 4
                  # nvidia.com/gpu: 8
              command:
                - "python3"
                - "/opt/pytorch-mnist/mnist.py"
                - "--epochs=10"