# Example from: https://github.com/kubeflow/training-operator/tree/master/examples/pytorch/elastic/imagenet
---
# PyTorchJob (kubeflow.org/v1) named "imagenet-elastic-gpu".
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: imagenet-elastic-gpu
spec:
  elasticPolicy:
    # Replica bounds and restart budget for the elastic job.
    minReplicas: 2
    maxReplicas: 36
    maxRestarts: 100
    # Rendezvous endpoint: etcd backend reached at etcd-service:2379.
    # NOTE(review): "etcd-service" is not defined in this manifest; presumably
    # a Service in the same namespace -- confirm it is deployed first.
    rdzvBackend: etcd
    rdzvHost: etcd-service
    rdzvPort: 2379
    # Scaling metric: average CPU utilization with a target of 80.
    # NOTE(review): Kubernetes evaluates Utilization targets as a percentage of
    # the containers' CPU requests. The worker container in this file requests
    # only nvidia.com/gpu -- confirm whether a CPU request is needed for this
    # metric to take effect.
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    # Single replica type: 36 workers, each restarted on failure.
    Worker:
      replicas: 36
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            # Also used by the podAffinity selector below.
            app: imagenet
        spec:
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "g4dn.metal"
          affinity:
            # Hard requirement: place each worker in the same zone as other
            # pods labelled app=imagenet (i.e. the other workers of this job).
            podAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - imagenet
                  topologyKey: "topology.kubernetes.io/zone"
          containers:
            - name: pytorch
              # NOTE(review): a mutable ":latest" tag combined with
              # IfNotPresent means nodes may run different cached builds;
              # consider pinning a specific tag or digest.
              image: kubeflow/pytorch-elastic-example-imagenet:latest
              imagePullPolicy: IfNotPresent
              # One GPU per worker pod.
              resources:
                requests:
                  nvidia.com/gpu: 1
                limits:
                  nvidia.com/gpu: 1
              env:
                - name: LOGLEVEL
                  value: DEBUG
              # Download the training script, then launch it through
              # torch.distributed.run. "set -e" aborts the container if the
              # download fails; previously the two commands were joined with
              # ";", so a failed wget still started the trainer against a
              # missing or empty script and the failure was masked.
              command:
                - bash
                - -c
                - |
                  set -e
                  wget https://raw.githubusercontent.com/pytorch/elastic/master/examples/imagenet/main.py \
                    -O /workspace/examples/imagenet.py
                  python -m torch.distributed.run /workspace/examples/imagenet.py \
                    --arch=resnet18 \
                    --epochs=20 \
                    --batch-size=32 \
                    --workers=0 \
                    --checkpoint-file=/workspace/checkpoint.pth.tar \
                    /workspace/data/tiny-imagenet-200