#!/bin/bash
# SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates.
# SPDX-License-Identifier: MIT-0
#
# Submits the Delta Lake SCD PySpark script (blog/delta_scd_script.py in the
# project bucket) as a job run on an EMR on EKS virtual cluster.
#
# Required environment variables (set them before running), for example:
#   export EMRCLUSTER_NAME=emr-on-eks-quickstart
#   export AWS_REGION=us-east-1
#
# Exported by this script (derived from the variables above):
#   ACCOUNTID           AWS account id of the caller
#   VIRTUAL_CLUSTER_ID  id of the RUNNING virtual cluster named $EMRCLUSTER_NAME
#   EMR_ROLE_ARN        arn:aws:iam::<account>:role/<cluster name>-execution-role
#   S3BUCKET            <cluster name>-<account>-<region>

set -euo pipefail

# Fail fast with a clear message instead of building names from empty strings.
: "${EMRCLUSTER_NAME:?EMRCLUSTER_NAME must be set (e.g. emr-on-eks-quickstart)}"
: "${AWS_REGION:?AWS_REGION must be set (e.g. us-east-1)}"

# Assign first, export afterwards: `export VAR=$(cmd)` would hide a failing cmd.
ACCOUNTID=$(aws sts get-caller-identity --query Account --output text)
export ACCOUNTID

VIRTUAL_CLUSTER_ID=$(aws emr-containers list-virtual-clusters \
  --query "virtualClusters[?name == '${EMRCLUSTER_NAME}' && state == 'RUNNING'].id" \
  --output text)
export VIRTUAL_CLUSTER_ID

# The lookup must yield exactly one id; anything else cannot be submitted to.
if [[ -z "${VIRTUAL_CLUSTER_ID}" || "${VIRTUAL_CLUSTER_ID}" == "None" ]]; then
  printf 'error: no RUNNING virtual cluster named "%s" was found\n' \
    "${EMRCLUSTER_NAME}" >&2
  exit 1
fi
if [[ "${VIRTUAL_CLUSTER_ID}" =~ [[:space:]] ]]; then
  printf 'error: more than one RUNNING virtual cluster named "%s": %s\n' \
    "${EMRCLUSTER_NAME}" "${VIRTUAL_CLUSTER_ID}" >&2
  exit 1
fi

export EMR_ROLE_ARN="arn:aws:iam::${ACCOUNTID}:role/${EMRCLUSTER_NAME}-execution-role"
export S3BUCKET="${EMRCLUSTER_NAME}-${ACCOUNTID}-${AWS_REGION}"

# JSON payloads are built in unquoted heredocs so ${S3BUCKET} is expanded
# without splicing unquoted variables between single-quoted fragments.
job_driver=$(cat <<EOF
{
  "sparkSubmitJobDriver": {
    "entryPoint": "s3://${S3BUCKET}/blog/delta_scd_script.py",
    "entryPointArguments": ["${S3BUCKET}"],
    "sparkSubmitParameters": "--conf spark.executor.memory=2G --conf spark.executor.cores=2"
  }
}
EOF
)

configuration_overrides=$(cat <<EOF
{
  "applicationConfiguration": [
    {
      "classification": "spark-defaults",
      "properties": {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.hadoop.hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
        "spark.jars": "https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.1.0/delta-core_2.12-2.1.0.jar,https://repo1.maven.org/maven2/io/delta/delta-storage/2.1.0/delta-storage-2.1.0.jar"
      }
    }
  ],
  "monitoringConfiguration": {
    "s3MonitoringConfiguration": {
      "logUri": "s3://${S3BUCKET}/elasticmapreduce/emr-containers"
    }
  }
}
EOF
)

aws emr-containers start-job-run \
  --virtual-cluster-id "${VIRTUAL_CLUSTER_ID}" \
  --name em68-delta \
  --execution-role-arn "${EMR_ROLE_ARN}" \
  --release-label emr-6.8.0-latest \
  --job-driver "${job_driver}" \
  --configuration-overrides "${configuration_overrides}"