apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: word-count
  namespace: spark
spec:
  type: Python
  pythonVersion: "3"
  mode: cluster
  image: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0
  imagePullPolicy: Always
  mainApplicationFile: "s3a://$(BUCKET_PARAM)/app_code/job/wordcount.py"
  arguments: ["s3a://amazon-reviews-pds/parquet/", "s3a://$(BUCKET_PARAM)/app_code/output/native"]
  sparkVersion: "3.0.3"
  sparkConf:
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem"
    # Authenticate to S3 via the pod's web identity token (IAM roles for service accounts)
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
    "spark.kubernetes.allocation.batch.size": "15"
    # Encrypt shuffle and spill data written to local storage
    "spark.io.encryption.enabled": "true"
    # Use memory-backed tmpfs for Spark's local scratch directories
    "spark.kubernetes.local.dirs.tmpfs": "true"
  volumes:
    - name: spark-local-dir-1
      hostPath:
        path: "/tmp"
        type: Directory
  dynamicAllocation:
    enabled: true
    initialExecutors: 1
    minExecutors: 1
    maxExecutors: 20
  restartPolicy:
    type: OnFailure
    onFailureRetries: 3
    onFailureRetryInterval: 10
    onSubmissionFailureRetries: 5
    onSubmissionFailureRetryInterval: 5
  driver:
    # driver runs on Spot (node affinity pins it to nodes labeled lifecycle=Ec2Spot)
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: lifecycle
                  operator: In
                  values:
                    - Ec2Spot
                    # - OnDemand
    env:
      - name: BUCKET_PARAM
        valueFrom:
          configMapKeyRef:
            name: special-config
            key: codeBucket
    cores: 1
    memory: "1G"
    labels:
      role: driver
    serviceAccount: nativejob
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: "/tmp"
  executor:
    # executors run on Spot
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: lifecycle
                  operator: In
                  values:
                    - Ec2Spot
    cores: 1
    memory: "4G"
    labels:
      role: executor
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: "/tmp"
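---
# The SparkApplication above resolves BUCKET_PARAM from a ConfigMap named
# "special-config" (key "codeBucket"). Below is a minimal sketch of that
# ConfigMap, assuming it lives in the same "spark" namespace as the driver
# pod; the bucket value is a placeholder to replace with your own S3 bucket,
# not a value taken from the original manifest.
apiVersion: v1
kind: ConfigMap
metadata:
  name: special-config
  namespace: spark
data:
  codeBucket: <your-code-bucket>  # placeholder: bucket holding app_code/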