---
# PyTorchJob that runs the pytorch-dist-mnist-test image with the "gloo"
# backend on one Master replica and one Worker replica.
apiVersion: "kubeflow.org/v1beta2"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-mnist-gloo"
spec:
  pytorchReplicaSpecs:
    # Master replica: same image as the Worker, but additionally passes
    # --save-model and mounts a host directory at /mount.
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: seedjeffwan/pytorch-dist-mnist-test:1.0
              imagePullPolicy: Always
              args: ["--backend", "gloo", "--save-model"]
              # Comment out the resources block below to run on the CPU
              # instead of requesting a GPU.
              resources:
                limits:
                  nvidia.com/gpu: 1
              # NOTE(review): presumably where --save-model writes its
              # output; confirm against the image's training script.
              volumeMounts:
                - mountPath: /mount
                  name: host-volume
          volumes:
            # Exposes the node's /tmp to the container. With
            # type: Directory the path must already exist on the node.
            - name: host-volume
              hostPath:
                path: /tmp
                type: Directory
    # Worker replica: no --save-model argument and no volume mount.
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: seedjeffwan/pytorch-dist-mnist-test:1.0
              imagePullPolicy: Always
              args: ["--backend", "gloo"]
              # Comment out the resources block below to run on the CPU
              # instead of requesting a GPU.
              resources:
                limits:
                  nvidia.com/gpu: 1