apiVersion: "sparkoperator.k8s.io/v1beta2" kind: SparkApplication metadata: name: tpcds-benchmark-data-generation-1g namespace: default spec: type: Scala image: seedjeffwan/spark:v2.4.5-examples sparkVersion: 2.4.5 mainClass: com.amazonaws.eks.tpcds.DataGeneration mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar mode: cluster arguments: # TPC-DS data localtion - "s3a://spark-k8s-data/TPCDS-TEST-1G" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format - "parquet" # Scale factor (in GB) - "1" # Generate data num partitions - "100" # Create the partitioned fact tables - "false" # Shuffle to get partitions coalesced into single files. - "false" # Logging set to WARN - "true" sparkConf: "spark.speculation": "false" "spark.network.timeout": "2400" # S3 Credential "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider" # S3 Configs "spark.hadoop.fs.s3a.connection.timeout": "1200000" "spark.hadoop.fs.s3a.path.style.access": "true" "spark.hadoop.fs.s3a.connection.maximum": "200" "spark.hadoop.fs.s3a.fast.upload": "true" "spark.hadoop.fs.s3a.readahead.range": "256K" "spark.hadoop.fs.s3a.input.fadvise": "random" # S3A Committer "spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a": "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory" # "spark.sql.sources.commitProtocolClass": "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol" # "spark.sql.parquet.output.committer.class": "org.apache.hadoop.mapreduce.lib.output.BindingPathOutputCommitter" "spark.hadoop.fs.s3a.committer.name": "directory" "spark.hadoop.fs.s3a.committer.staging.conflict-mode": "append" driver: cores: 2 coreLimit: "2048m" memory: "8000m" serviceAccount: spark executor: instances: 3 cores: 1 memory: "8000m" memoryOverhead: 2g restartPolicy: type: Never