#!/bin/bash
# SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates.
# SPDX-License-Identifier: MIT-0

# Adjust these two values to match your environment before running.
export EMRCLUSTER_NAME=emr-on-eks-quickstart
export AWS_REGION=us-east-1

# Derive the account ID, the running EMR on EKS virtual cluster, the job
# execution role, and the S3 bucket that holds the job script and logs.
export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text)
export VIRTUAL_CLUSTER_ID=$(aws emr-containers list-virtual-clusters --query "virtualClusters[?name == '$EMRCLUSTER_NAME' && state == 'RUNNING'].id" --output text)
export EMR_ROLE_ARN=arn:aws:iam::$ACCOUNTID:role/$EMRCLUSTER_NAME-execution-role
export S3BUCKET=$EMRCLUSTER_NAME-$ACCOUNTID-$AWS_REGION

# Submit the Hudi copy-on-write (COW) job. The Hudi and Avro bundles shipped
# in the EMR 6.8 image are added via --jars, and the AWS Glue Data Catalog is
# configured as the Hive metastore.
aws emr-containers start-job-run \
  --virtual-cluster-id $VIRTUAL_CLUSTER_ID \
  --name em68-hudi-cow \
  --execution-role-arn $EMR_ROLE_ARN \
  --release-label emr-6.8.0-latest \
  --job-driver '{
    "sparkSubmitJobDriver": {
      "entryPoint": "s3://'$S3BUCKET'/blog/hudi_scd_script.py",
      "entryPointArguments": ["'$AWS_REGION'","'$S3BUCKET'","COW"],
      "sparkSubmitParameters": "--jars local:///usr/lib/hudi/hudi-spark-bundle.jar,local:///usr/lib/spark/external/lib/spark-avro.jar --conf spark.executor.memory=2G --conf spark.executor.cores=2"}}' \
  --configuration-overrides '{
    "applicationConfiguration": [
      {
        "classification": "spark-defaults",
        "properties": {
          "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
          "spark.hadoop.hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
        }}
    ],
    "monitoringConfiguration": {
      "s3MonitoringConfiguration": {"logUri": "s3://'$S3BUCKET'/elasticmapreduce/emr-containers"}}}'
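
# A minimal sketch (not part of the original script) of how the submitted run
# could be tracked afterwards. It assumes the start-job-run output above is
# captured and its top-level "id" field extracted, e.g. by appending
# --query id --output text to the call:
#
#   JOB_RUN_ID=$(aws emr-containers start-job-run ... --query id --output text)
#   aws emr-containers describe-job-run \
#     --virtual-cluster-id $VIRTUAL_CLUSTER_ID \
#     --id $JOB_RUN_ID \
#     --query 'jobRun.state' --output text
#
# Driver and executor logs are written under the logUri configured above.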