---
# ElasticJob that launches the ImageNet example trainer via torchrun.
# Each worker pod requests 4 GPUs and starts 4 trainer processes.
apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet
  # Uncomment to pin the job to a specific namespace.
  # namespace: elastic-job
spec:
  # Rendezvous endpoint used by the workers.
  # NOTE(review): presumably an etcd Service reachable from the worker pods
  # under this name - confirm it exists in the job's namespace.
  rdzvEndpoint: etcd-service:2379
  # Lower and upper bound on the number of worker replicas.
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 1
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          nodeSelector:
            # Stable well-known label; replaces the deprecated
            # beta.kubernetes.io/instance-type label.
            # Alternatives previously tried: g4dn.12xlarge, p3dn.24xlarge
            node.kubernetes.io/instance-type: p3.8xlarge
          containers:
            - name: elasticjob-worker
              # NOTE(review): "xxxxxxxxxxxx" looks like a placeholder for the
              # AWS account ID - replace it before applying.
              image: xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/pytorch-efa:latest
              imagePullPolicy: Always
              env:
                - name: NCCL_DEBUG
                  value: INFO
                # Uncomment to override the libfabric provider.
                # - name: FI_PROVIDER
                #   value: sockets
              command: ["torchrun"]
              args:
                # Trainer processes per pod; kept equal to the nvidia.com/gpu
                # limit below.
                - "--nproc_per_node=4"
                - "/workspace/elastic/examples/imagenet/main.py"
                - "--arch=resnet50"
                - "--epochs=1"
                - "--batch-size=64"
                # Number of data loader workers (NOT trainers).
                # Zero means the data is loaded in the same process as the
                # trainer. PyTorch data loaders use shared memory (shm).
                - "--workers=4"
                - "--checkpoint-file=/efs-shared/checkpoint.pth.tar"
                # Root directory of the ImageNet dataset on the shared volume.
                - "/efs-shared/ILSVRC/Data/CLS-LOC/"
              resources:
                limits:
                  nvidia.com/gpu: 4
              volumeMounts:
                # Shared volume holding the dataset and the checkpoint file.
                - name: efs-pv
                  mountPath: /efs-shared
                # Gives the worker pods increased shared memory, which is
                # required when using more than 0 data loader workers.
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: efs-pv
              persistentVolumeClaim:
                claimName: efs-pvc
            # Memory-backed emptyDir mounted at /dev/shm by the worker.
            - name: dshm
              emptyDir:
                medium: Memory