apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet-efa
  #namespace: elastic-job
spec:
  # Use "etcd-service:2379" if you already applied etcd.yaml
  rdzvEndpoint: etcd-service:2379
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          nodeSelector:
            #beta.kubernetes.io/instance-type: p3dn.24xlarge
            beta.kubernetes.io/instance-type: p4d.24xlarge
            #beta.kubernetes.io/instance-type: g4dn.metal
          containers:
          - name: elasticjob-worker
            image: xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/pytorch-efa:latest
            imagePullPolicy: Always
            env:
            - name: NCCL_DEBUG
              value: INFO
            - name: NCCL_ALGO
              value: Ring
            - name: FI_PROVIDER
              value: efa
            - name: FI_EFA_USE_DEVICE_RDMA
              value: "1"
            - name: RDMAV_FORK_SAFE
              value: "1"
            - name: FI_LOG_LEVEL
              value: "1"
            command: ["torchrun"]
            args:
            - "--nproc_per_node=8"
            - "/workspace/elastic/examples/imagenet/main.py"
            - "--arch=efficientnet_b7"
            - "--epochs=1"
            - "--batch-size=64"
            # number of data loader workers (NOT trainers)
            # zero means load the data on the same process as the trainer
            # pytorch data loaders use shm
            - "--workers=4"
            - "--checkpoint-file=/fsx-shared/checkpoint.pth.tar"
            # This is the directory structure for the ImageNet dataset
            - "/fsx-shared/ILSVRC/Data/CLS-LOC/"
            resources:
              limits:
                nvidia.com/gpu: 8
                hugepages-2Mi: 5120Mi
                vpc.amazonaws.com/efa: 4
                memory: 80000Mi
              requests:
                nvidia.com/gpu: 8
                hugepages-2Mi: 5120Mi
                vpc.amazonaws.com/efa: 4
                memory: 80000Mi
            volumeMounts:
            - name: fsx-pv
              mountPath: /fsx-shared
            # The following enables the worker pods to use increased shared memory,
            # which is required when specifying more than 0 data loader workers
            - name: dshm
              mountPath: /dev/shm
          volumes:
          - name: fsx-pv
            persistentVolumeClaim:
              claimName: fsx-pvc
          - name: dshm
            emptyDir:
              medium: Memory
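# Usage sketch, assuming the TorchElastic ElasticJob controller, the etcd-service
# rendezvous endpoint, and the fsx-pvc PersistentVolumeClaim referenced above are
# already deployed in the cluster; the manifest filename and pod name below are
# illustrative, not prescribed by this spec:
#
#   kubectl apply -f imagenet-efa.yaml
#   kubectl get pods -w                          # watch the imagenet-efa worker pods start
#   kubectl logs -f <imagenet-efa-worker-pod>    # follow NCCL/EFA init and training output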