#!/bin/bash

#####################################################
# The environment variables are specific to the demo.
# Populate the values below to match your setup.
#
# EMR cluster variables
AWS_PROFILE='default'
REGION='us-east-1'
SUBNET_ID='subnet-12345678'
EMR_CLUSTER_NAME='CrrPreexistingDemo'

# NOTE: The Spark application stores its results in the inventory bucket.
# This script clears the results on each run using s3 rm --recursive.
INVENTORY_BUCKET='crr-preexisting-demo-inventory'

MASTER_INSTANCE_TYPE='m4.xlarge'
CORE_INSTANCE_TYPE='m4.2xlarge'
CORE_INSTANCE_COUNT='1'
EMR_RELEASE='emr-5.17.0'

# EMR job variables
GLUE_DATABASE_NAME='default'
ATHENA_TABLE_NAME='crr_preexisting_demo'
INVENTORY_DATE='2019-02-24-04-00'
PARTITIONS='1'
#####################################################

echo "Creating default EMR roles.."
aws --profile $AWS_PROFILE emr create-default-roles

echo "Uploading bootstrap actions and steps to S3.."
aws --profile $AWS_PROFILE s3 cp emr_scripts/bootstrap.sh s3://${INVENTORY_BUCKET}/emr/bootstrap.sh
aws --profile $AWS_PROFILE s3 cp emr_scripts/step_0.sh s3://${INVENTORY_BUCKET}/emr/step_0.sh
aws --profile $AWS_PROFILE s3 cp emr_scripts/copy_objects.py s3://${INVENTORY_BUCKET}/emr/copy_objects.py

echo "Clearing results table.."
aws --profile $AWS_PROFILE s3 rm s3://${INVENTORY_BUCKET}/results --recursive

echo "Creating EMR cluster.."
aws emr create-cluster \
    --auto-terminate \
    --profile $AWS_PROFILE \
    --applications Name=Spark Name=Hadoop \
    --name $EMR_CLUSTER_NAME \
    --region $REGION \
    --ebs-root-volume-size 10 \
    --release-label $EMR_RELEASE \
    --service-role EMR_DefaultRole \
    --auto-scaling-role EMR_AutoScaling_DefaultRole \
    --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
    --tags 'Name='$EMR_CLUSTER_NAME'' \
    --instance-groups '[{"InstanceCount":'$CORE_INSTANCE_COUNT',"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"'$CORE_INSTANCE_TYPE'","Name":"Core"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"'$MASTER_INSTANCE_TYPE'","Name":"Master"}]' \
    --log-uri 's3n://'$INVENTORY_BUCKET'/emr/logs/' \
    --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,SubnetId=$SUBNET_ID \
    --configurations '[{"Classification":"spark","Properties":{"maximizeResourceAllocation":"true"},"Configurations":[]},{"Classification":"spark-hive-site","Properties":{"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"},"Configurations":[]}]' \
    --bootstrap-actions '[{"Path":"s3://'$INVENTORY_BUCKET'/emr/bootstrap.sh","Name":"Custom action"}]' \
    --steps '[{"Args":["s3://'$INVENTORY_BUCKET'/emr/step_0.sh","s3://'$INVENTORY_BUCKET'/emr/copy_objects.py"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"s3://'$REGION'.elasticmapreduce/libs/script-runner/script-runner.jar","Properties":"","Name":"Download Spark Script"},{"Args":["spark-submit","--deploy-mode","client","/home/hadoop/copy_objects.py","'$GLUE_DATABASE_NAME'.'$ATHENA_TABLE_NAME'","'$INVENTORY_DATE'","s3://'$INVENTORY_BUCKET'/results/","--partitions","'$PARTITIONS'","--acls"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"Run Spark Application"}]'
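
# --------------------------------------------------------------------------
# Optional follow-up (not part of the original demo): a minimal sketch of how
# you could block until the cluster finishes and inspect why it terminated.
# It assumes you change the create-cluster call above to capture its output,
# e.g. CLUSTER_ID=$(aws emr create-cluster ... --query 'ClusterId' --output text).
# Shown commented out so this script's behavior is unchanged.
#
# echo "Waiting for cluster $CLUSTER_ID to terminate.."
# aws --profile $AWS_PROFILE --region $REGION emr wait cluster-terminated \
#     --cluster-id "$CLUSTER_ID"
#
# # Print the final state and the reason EMR reports for the termination.
# aws --profile $AWS_PROFILE --region $REGION emr describe-cluster \
#     --cluster-id "$CLUSTER_ID" \
#     --query 'Cluster.Status.[State,StateChangeReason.Message]' --output text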