---
# PyTorchJob that runs the "pytorch" container with the NCCL backend
# (`--backend nccl`) on one Master replica and two Worker replicas, each
# limited to a single NVIDIA GPU.
#
# `${REGISTRY}` and `${IMAGE}` are placeholders, presumably substituted
# (e.g. with envsubst) before this manifest is applied; confirm against the
# deployment script.
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-mnist-nccl"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Opt this pod out of Istio sidecar injection.
            sidecar.istio.io/inject: "false"
        # NOTE(review): the Master pod has neither the `app: mnist` label nor
        # the pod-affinity rule used by the Workers, so nothing here ties it
        # to the Workers' zone. Confirm whether that is intended.
        spec:
          containers:
            - name: pytorch
              # Quoted so the templated value is always parsed as a string.
              image: "${REGISTRY}${IMAGE}:latest"
              args: ["--backend", "nccl"]
              resources:
                limits:
                  nvidia.com/gpu: 1
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Opt this pod out of Istio sidecar injection.
            sidecar.istio.io/inject: "false"
          labels:
            # Label matched by the podAffinity selector below.
            app: mnist
        spec:
          # Optional instance-type pinning, currently disabled:
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "g4dn.2xlarge"
          affinity:
            podAffinity:
              # Hard scheduling requirement: place each Worker in a zone
              # that already hosts a pod labelled `app: mnist`.
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - mnist
                  topologyKey: "topology.kubernetes.io/zone"
          containers:
            - name: pytorch
              # Quoted so the templated value is always parsed as a string.
              image: "${REGISTRY}${IMAGE}:latest"
              args: ["--backend", "nccl"]
              resources:
                limits:
                  nvidia.com/gpu: 1