---
# Example from:
# https://github.com/kubeflow/training-operator/tree/master/examples/pytorch/elastic/imagenet
#
# PyTorchJob that runs /workspace/examples/imagenet.py through
# `python -m torch.distributed.run`, with an elastic policy attached.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: imagenet-elastic-cpu
spec:
  elasticPolicy:
    # Rendezvous goes through etcd at etcd-service:2379. That service is not
    # declared in this manifest -- NOTE(review): presumably it must already
    # exist in the job's namespace; confirm before applying.
    rdzvBackend: etcd
    rdzvHost: etcd-service
    rdzvPort: 2379
    # Bounds for the number of workers and for restarts.
    minReplicas: 2
    maxReplicas: 24
    maxRestarts: 100
    # Scaling metric: average CPU utilization with a target of 80.
    # NOTE(review): presumably a percentage of the container's CPU request
    # (HPA-style semantics) -- confirm against the operator documentation.
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      # Initial worker count; equal to elasticPolicy.minReplicas above.
      replicas: 2
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: kubeflow/pytorch-elastic-example-imagenet:latest
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 4
              env:
                - name: LOGLEVEL
                  value: DEBUG
              command:
                - python
                - -m
                - torch.distributed.run
                - /workspace/examples/imagenet.py
                # Arguments passed to imagenet.py; the last one is the
                # dataset directory inside the image.
                - "--arch=resnet18"
                - "--epochs=20"
                - "--batch-size=32"
                - "--workers=0"
                - "/workspace/data/tiny-imagenet-200"