apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet-efa
  #namespace: elastic-job
spec:
  # Use "etcd-service:2379" if you already applied etcd.yaml
  rdzvEndpoint: etcd-service:2379
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          nodeSelector:
            #beta.kubernetes.io/instance-type: p3dn.24xlarge
            beta.kubernetes.io/instance-type: p4d.24xlarge
            #beta.kubernetes.io/instance-type: g4dn.metal
          containers:
          - name: elasticjob-worker
            image: xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/pytorch-efa:latest
            imagePullPolicy: Always
            env:
            - name: NCCL_DEBUG
              value: INFO
            - name: NCCL_ALGO
              value: Ring
            - name: FI_PROVIDER
              value: efa
            - name: FI_EFA_USE_DEVICE_RDMA
              value: "1"
            - name: RDMAV_FORK_SAFE
              value: "1"
            - name: FI_LOG_LEVEL
              value: "1"
            command: ["torchrun"]
            args:
            - "--nproc_per_node=8"
            - "/workspace/elastic/examples/imagenet/main.py"
            - "--arch=efficientnet_b7"
            - "--epochs=1"
            - "--batch-size=64"
            # number of data loader workers (NOT trainers)
            # zero means load the data on the same process as the trainer
            # pytorch data loaders use shm
            - "--workers=4"
            - "--checkpoint-file=/fsx-shared/checkpoint.pth.tar"
            # This is the directory structure for the ImageNet dataset
            - "/fsx-shared/ILSVRC/Data/CLS-LOC/"
            resources:
              limits:
                nvidia.com/gpu: 8
                hugepages-2Mi: 5120Mi
                vpc.amazonaws.com/efa: 4
                memory: 80000Mi
              requests:
                nvidia.com/gpu: 8
                hugepages-2Mi: 5120Mi
                vpc.amazonaws.com/efa: 4
                memory: 80000Mi
            volumeMounts:
            - name: fsx-pv
              mountPath: /fsx-shared
            # The following enables the worker pods to use increased shared memory,
            # which is required when specifying more than 0 data loader workers
            - name: dshm
              mountPath: /dev/shm
          volumes:
          - name: fsx-pv
            persistentVolumeClaim:
              claimName: fsx-pvc
          - name: dshm
            emptyDir:
              medium: Memory
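# Usage sketch, assuming the TorchElastic ElasticJob controller, the etcd-service
# rendezvous endpoint, and the fsx-pvc PersistentVolumeClaim referenced above are
# already deployed in the cluster; the manifest filename and pod name below are
# illustrative, not prescribed by this spec:
#
#   kubectl apply -f imagenet-efa.yaml
#   kubectl get pods -w                          # watch the imagenet-efa worker pods start
#   kubectl logs -f <imagenet-efa-worker-pod>    # follow NCCL/EFA init and training output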