---
# TorchElastic ElasticJob that runs the ImageNet example (ResNet-50, one epoch)
# with `torchrun`, 4 processes per worker pod. The job may scale between
# minReplicas and maxReplicas worker pods; workers rendezvous through the etcd
# endpoint given in `rdzvEndpoint`.
apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet
  # namespace: elastic-job
spec:
  rdzvEndpoint: "etcd-service:2379"
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 1
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          nodeSelector:
            # Alternative instance types: g4dn.12xlarge, p3dn.24xlarge
            # NOTE(review): `beta.kubernetes.io/instance-type` is the
            # deprecated spelling of `node.kubernetes.io/instance-type`.
            # Kept as-is; confirm which label the cluster's nodes carry
            # before switching.
            beta.kubernetes.io/instance-type: p3.8xlarge
          containers:
            - name: elasticjob-worker
              image: xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/pytorch-efa:latest
              imagePullPolicy: Always
              env:
                # NCCL log level.
                - name: NCCL_DEBUG
                  value: INFO
                # - name: FI_PROVIDER
                #   value: sockets
              command: ["torchrun"]
              args:
                # Trainer processes per pod; matches the nvidia.com/gpu limit
                # below.
                - "--nproc_per_node=4"
                - "/workspace/elastic/examples/imagenet/main.py"
                - "--arch=resnet50"
                - "--epochs=1"
                - "--batch-size=64"
                # Number of data loader workers (NOT trainers).
                # Zero means the data is loaded in the same process as the
                # trainer. PyTorch data loaders use shared memory (shm), which
                # is why /dev/shm is mounted from the `dshm` volume below.
                - "--workers=4"
                - "--checkpoint-file=/efs-shared/checkpoint.pth.tar"
                # Directory holding the ImageNet dataset.
                - "/efs-shared/ILSVRC/Data/CLS-LOC/"
              resources:
                limits:
                  nvidia.com/gpu: 4
              volumeMounts:
                - name: efs-pv
                  mountPath: /efs-shared
                # The following enables the worker pods to use increased
                # shared memory, which is required when specifying more than
                # 0 data loader workers.
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: efs-pv
              persistentVolumeClaim:
                claimName: efs-pvc
            # Memory-backed emptyDir that provides /dev/shm for the container.
            - name: dshm
              emptyDir:
                medium: Memory