---
# Example from: https://github.com/kubeflow/training-operator/tree/master/examples/pytorch/elastic/imagenet
#
# Elastic PyTorchJob that trains resnet18 on tiny-imagenet-200 with one GPU
# per worker. Workers rendezvous through an external etcd endpoint and the
# worker count may vary between minReplicas and maxReplicas.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: imagenet-elastic-gpu
spec:
  elasticPolicy:
    # Rendezvous endpoint used by the workers to discover each other.
    # NOTE(review): assumes something resolvable as `etcd-service` is
    # listening on 2379 from the job's namespace - confirm it is deployed.
    rdzvBackend: etcd
    rdzvHost: etcd-service
    rdzvPort: 2379
    # Bounds for the elastic worker count, and the restart budget.
    minReplicas: 2
    maxReplicas: 36
    maxRestarts: 100
    # Scaling signal: average CPU utilization, target 80%.
    # NOTE(review): the worker container below declares no CPU request;
    # utilization-based metrics are normally computed against requests -
    # verify this metric is actually usable for these pods.
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      # Starts at the same value as elasticPolicy.maxReplicas.
      replicas: 36
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            # Matched by the podAffinity selector below.
            app: imagenet
        spec:
          # Optional: uncomment to pin workers to a specific instance type.
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "g4dn.metal"
          affinity:
            # Hard requirement: schedule each worker into a zone that already
            # runs a pod labelled app=imagenet (i.e. keep workers in one zone).
            podAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - imagenet
                  topologyKey: "topology.kubernetes.io/zone"
          containers:
            - name: pytorch
              # NOTE(review): `latest` combined with IfNotPresent means a node
              # that already has the image cached will not pull a newer one.
              image: kubeflow/pytorch-elastic-example-imagenet:latest
              imagePullPolicy: IfNotPresent
              # One GPU per worker pod.
              resources:
                requests:
                  nvidia.com/gpu: 1
                limits:
                  nvidia.com/gpu: 1
              env:
                - name: LOGLEVEL
                  value: DEBUG
              # Downloads the training script, then launches it through
              # torch.distributed.run. The folded scalar (>-) joins the lines
              # below with single spaces into one shell string.
              # NOTE(review): the two steps are separated by `;`, so the
              # python step runs even when wget fails; the script is also
              # fetched from the `master` branch at container start.
              command:
                - bash
                - -c
                - >-
                  wget https://raw.githubusercontent.com/pytorch/elastic/master/examples/imagenet/main.py
                  -O /workspace/examples/imagenet.py;
                  python -m torch.distributed.run /workspace/examples/imagenet.py
                  --arch=resnet18
                  --epochs=20
                  --batch-size=32
                  --workers=0
                  --checkpoint-file=/workspace/checkpoint.pth.tar
                  /workspace/data/tiny-imagenet-200