---
# ElasticJob manifest for the "classy-vision" training example.
# Declares a single Worker replica group (2 replicas requested, elastic
# between minReplicas=1 and maxReplicas=2) whose pods run the
# torchelastic/examples:0.2.0 image.
apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: classy-vision
  namespace: elastic-job
spec:
  # Use "etcd-service:2379" if you have already applied etcd.yaml.
  # NOTE(review): ":" looks like an empty "<host>:<port>" placeholder --
  # replace it with the real rendezvous endpoint before applying.
  rdzvEndpoint: ":"
  minReplicas: 1
  maxReplicas: 2
  replicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          containers:
            - name: elasticjob-worker
              image: torchelastic/examples:0.2.0
              imagePullPolicy: Always
              args:
                - "--nproc_per_node=1"
                - "/workspace/classy_vision/classy_train.py"
                - "--config_file"
                - "/workspace/classy_vision/configs/template_config.json"
                # Number of data loader workers (NOT trainers).
                # Zero means the data is loaded in the same process as the
                # trainer. This is set so that the container does not OOM,
                # since PyTorch data loaders use shm.
                - "--num_workers=0"