---
# ElasticJob "imagenet": runs /workspace/examples/imagenet/main.py (resnet18,
# 20 epochs, batch size 32) against /workspace/data/tiny-imagenet-200 using
# the torchelastic/examples:0.2.0 image.
#
# Each worker pod requests 8 GPUs and is started with --nproc_per_node=8.
# The job asks for 2 worker replicas; minReplicas/maxReplicas (1..128) are the
# bounds handed to the ElasticJob controller.
apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet
  # namespace: elastic-job
spec:
  # Use "etcd-service:2379" if you have already applied etcd.yaml.
  # Quoted so the host:port value is always read as a plain string.
  rdzvEndpoint: "etcd-service:2379"
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          nodeSelector:
            # NOTE(review): the beta.kubernetes.io/instance-type label is
            # deprecated in favour of node.kubernetes.io/instance-type.
            # Left unchanged here; confirm which label the target cluster's
            # nodes carry before switching.
            beta.kubernetes.io/instance-type: g4dn.metal
          containers:
            - name: elasticjob-worker
              image: torchelastic/examples:0.2.0
              imagePullPolicy: Always
              env:
                - name: NCCL_DEBUG
                  value: INFO
                # Optional overrides, disabled by default:
                # - name: NCCL_SOCKET_IFNAME
                #   value: lo
                # - name: FI_PROVIDER
                #   value: sockets
              args:
                # Keep in sync with resources.limits."nvidia.com/gpu" below.
                - "--nproc_per_node=8"
                - "/workspace/examples/imagenet/main.py"
                - "--arch=resnet18"
                - "--epochs=20"
                - "--batch-size=32"
                # Number of data loader workers (NOT trainers).
                # Zero means the data is loaded in the same process as the
                # trainer. This is set so that the container does not OOM,
                # since PyTorch data loaders use shm.
                - "--workers=0"
                - "/workspace/data/tiny-imagenet-200"
              resources:
                limits:
                  nvidia.com/gpu: 8