---
AWSTemplateFormatVersion: '2010-09-09'
Description: ML Pipeline Stack for EMR Cluster Deployment

# Template overview:
#   * Cluster               - EMR cluster (Hadoop, Hive, Presto, Ganglia, Spark,
#                             Hue) with one master node and an auto-scaled core
#                             instance group.
#   * EMRRole               - service role assumed by elasticmapreduce.amazonaws.com.
#   * EMREC2Role            - role assumed by the cluster's EC2 instances.
#   * EMREC2InstanceProfile - instance profile wrapping EMREC2Role.
# Both roles set an explicit RoleName.

Mappings:
  # Per-region lookup keyed by instance type with the dot removed
  # (e.g. c5.2xlarge -> c52xlarge).
  # NOTE(review): presumably on-demand hourly prices - confirm units/currency.
  # NOTE(review): nothing in this template reads the mapping via Fn::FindInMap.
  # Values are quoted so that trailing zeros ('0.30', '2.00') are preserved.
  InstanceOnDemand:
    eu-west-1:
      c5xlarge: '0.17'
      c52xlarge: '0.34'
      c54xlarge: '0.68'
      c518xlarge: '3.06'
      c5d2xlarge: '0.384'
      c5d4xlarge: '0.768'
      c5d18xlarge: '3.456'
      c5n2xlarge: '0.432'
      c5n4xlarge: '0.864'
      c5n18xlarge: '3.888'
      p32xlarge: '3.06'
      p38xlarge: '12.24'
      m3xlarge: '0.15'
      m32xlarge: '0.30'
      m4xlarge: '0.15'
      m42xlarge: '0.30'
      m44xlarge: '0.50'
      # Was 'm4416xlarge', which matches no allowed instance type; the key for
      # m4.16xlarge is m416xlarge.
      m416xlarge: '2.00'
      c3xlarge: '0.15'
      c32xlarge: '0.30'
      c34xlarge: '0.50'
      c4xlarge: '0.15'
      c42xlarge: '0.30'
      c44xlarge: '0.50'

Parameters:
  # Core Fields
  ClusterName:
    Description: The name for the EMR Cluster
    Type: String

  Owner:
    Description: The cluster owner (applied as the 'Owner' tag)
    Type: String

  BootstrapBucket:
    Description: S3 bucket where the bootstrap script is stored.
    Type: String

  EMRRelease:
    Description: The EMR release you'd like to use
    Type: String
    Default: emr-5.24.0

  AdditionalMasterSecurityGroup:
    Description: Additional security group to attach to the master node
    Type: String

  SubnetA:
    Description: ID of the subnet the cluster is launched into
    Type: String

  ClusterSize:
    Description: Number of core nodes
    Type: Number
    Default: 2

  # EC2 Wide Params
  Keyname:
    Description: Name of an existing EC2 key pair used for the cluster instances
    Type: AWS::EC2::KeyPair::KeyName

  # NOTE(review): this parameter is not referenced by any resource below;
  # Ec2SubnetId always uses the SubnetA parameter.
  EMRClusterSubnet:
    Description: Which Subnet the cluster will be deployed in?
    Type: String
    AllowedValues:
      - SubnetA
      - SubnetB
    Default: SubnetA

  # Master Instance Parameters
  MasterInstanceGroupType:
    Description: What type of instance do you want for the master node?
    Type: String
    AllowedValues:
      - c5.xlarge
      - c5.2xlarge
      - c5.4xlarge
      - c5.18xlarge
      - c5d.2xlarge
      - c5d.4xlarge
      - c5d.18xlarge
      - c5n.2xlarge
      - c5n.4xlarge
      - c5n.18xlarge
      - p3.2xlarge
      - p3.8xlarge
      - m3.xlarge
      - m3.2xlarge
      - m4.xlarge
      - m4.2xlarge
      - m4.4xlarge
      - m4.16xlarge
      - c3.xlarge
      - c3.2xlarge
      - c3.4xlarge
      - c4.xlarge
      - c4.2xlarge
      - c4.4xlarge
    Default: c5.2xlarge

  # Core Instance Parameters
  CoreInstanceGroupType:
    Description: What type of instance do you want for the core nodes?
    Type: String
    AllowedValues:
      - c5.xlarge
      - c5.2xlarge
      - c5.4xlarge
      - c5.18xlarge
      - c5d.2xlarge
      - c5d.4xlarge
      - c5d.18xlarge
      - c5n.2xlarge
      - c5n.4xlarge
      - c5n.18xlarge
      - p3.2xlarge
      - p3.8xlarge
      - m3.xlarge
      - m3.2xlarge
      - m4.xlarge
      - m4.2xlarge
      - m4.4xlarge
      - m4.16xlarge
      - c3.xlarge
      - c3.2xlarge
      - c3.4xlarge
      - c4.xlarge
      - c4.2xlarge
      - c4.4xlarge
    Default: c5.2xlarge

Resources:
  Cluster:
    Type: AWS::EMR::Cluster
    Properties:
      Name: !Sub '${ClusterName}-${AWS::StackName}'
      ReleaseLabel: !Ref EMRRelease
      Applications:
        - Name: 'Hadoop'
        - Name: 'Hive'
        - Name: 'Presto'
        - Name: 'Ganglia'
        - Name: 'Spark'
        - Name: 'Hue'
      # Account-default auto scaling role; it is not created by this template.
      AutoScalingRole: !Sub 'arn:aws:iam::${AWS::AccountId}:role/EMR_AutoScaling_DefaultRole'
      JobFlowRole: !Ref EMREC2InstanceProfile
      ServiceRole: !Ref EMRRole
      VisibleToAllUsers: true
      LogUri: !Sub 's3://${BootstrapBucket}/elasticmapreduce/${ClusterName}-Cluster'
      Instances:
        TerminationProtected: false
        AdditionalMasterSecurityGroups:
          - !Ref AdditionalMasterSecurityGroup
        Ec2SubnetId: !Ref SubnetA
        Ec2KeyName: !Ref Keyname
        MasterInstanceGroup:
          Name: !Sub '${ClusterName}-MasterGrp-${AWS::StackName}'
          InstanceCount: 1
          InstanceType: !Ref MasterInstanceGroupType
          Market: 'ON_DEMAND'
        CoreInstanceGroup:
          Name: !Sub '${ClusterName}-CoreGrp-${AWS::StackName}'
          InstanceCount: !Ref ClusterSize
          InstanceType: !Ref CoreInstanceGroupType
          Market: 'ON_DEMAND'
          EbsConfiguration:
            EbsBlockDeviceConfigs:
              - VolumeSpecification:
                  SizeInGB: 64
                  VolumeType: gp2
          # NOTE(review): capacity bounds are fixed at 2..10 regardless of the
          # ClusterSize parameter - confirm a ClusterSize outside that range
          # is intended to be allowed.
          AutoScalingPolicy:
            Constraints:
              MinCapacity: 2
              MaxCapacity: 10
            Rules:
              # Add 2 nodes when the average ContainerPending is >= 3 for
              # two consecutive 300-second periods.
              - Name: scale_out
                Description: 'Scale Up'
                Action:
                  SimpleScalingPolicyConfiguration:
                    AdjustmentType: CHANGE_IN_CAPACITY
                    ScalingAdjustment: 2
                    CoolDown: 300
                Trigger:
                  CloudWatchAlarmDefinition:
                    Namespace: AWS/ElasticMapReduce
                    MetricName: ContainerPending
                    Dimensions:
                      # Literal placeholder string; deliberately not wrapped
                      # in !Sub.
                      - Key: JobFlowId
                        Value: '${emr.clusterId}'
                    Statistic: AVERAGE
                    Unit: COUNT
                    Period: 300
                    EvaluationPeriods: 2
                    ComparisonOperator: GREATER_THAN_OR_EQUAL
                    Threshold: 3
              # Remove 3 nodes when the average ContainerPending is <= 0 for
              # three consecutive 300-second periods.
              - Name: scale_in
                Description: 'Scale Down'
                Action:
                  SimpleScalingPolicyConfiguration:
                    AdjustmentType: CHANGE_IN_CAPACITY
                    ScalingAdjustment: -3
                    CoolDown: 300
                Trigger:
                  CloudWatchAlarmDefinition:
                    Namespace: AWS/ElasticMapReduce
                    MetricName: ContainerPending
                    Dimensions:
                      - Key: JobFlowId
                        Value: '${emr.clusterId}'
                    Statistic: AVERAGE
                    Unit: COUNT
                    Period: 300
                    EvaluationPeriods: 3
                    ComparisonOperator: LESS_THAN_OR_EQUAL
                    Threshold: 0
      # ConfigurationProperties values are written as strings.
      # The hive-site, presto-connector-hive and spark-hive-site entries point
      # the engines at the AWS Glue Data Catalog.
      Configurations:
        - Classification: spark
          ConfigurationProperties:
            maximizeResourceAllocation: 'true'
        - Classification: hive-site
          ConfigurationProperties:
            hive.metastore.client.factory.class: com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
        - Classification: presto-connector-hive
          ConfigurationProperties:
            hive.metastore.glue.datacatalog.enabled: 'true'
        - Classification: spark-hive-site
          ConfigurationProperties:
            hive.metastore.client.factory.class: com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
      # To add bootstrap actions, modify the script bootstrapactions.sh and
      # store it in your S3 bucket under emr-bootstrap-scripts/.
      BootstrapActions:
        - Name: 'InstallScript'
          ScriptBootstrapAction:
            Path: !Sub 's3://${BootstrapBucket}/emr-bootstrap-scripts/bootstrapactions.sh'
      Tags:
        - Key: 'Name'
          Value: !Sub '${ClusterName}-Cluster-${AWS::StackName}'
        - Key: 'Project'
          Value: 'bigdata'
        - Key: 'Environment'
          Value: 'demo'
        - Key: 'Owner'
          Value: !Ref Owner
        - Key: 'Domain'
          Value: 'bigdata'

  # Service role assumed by the EMR service itself.
  EMRRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub 'EMRServiceRole-${AWS::StackName}'
      AssumeRolePolicyDocument:
        # Quoted so the value stays a string instead of being parsed as a date.
        Version: '2008-10-17'
        Statement:
          - Sid: ''
            Effect: Allow
            Principal:
              Service: 'elasticmapreduce.amazonaws.com'
            Action: 'sts:AssumeRole'
      Path: '/'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole'

  # Role assumed by the cluster's EC2 instances (via EMREC2InstanceProfile).
  # NOTE(review): the inline policy grants S3 access only, although the
  # cluster configuration enables the Glue Data Catalog - confirm whether
  # Glue permissions are supplied elsewhere.
  EMREC2Role:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub 'EMREC2Role-${AWS::StackName}'
      AssumeRolePolicyDocument:
        Version: '2008-10-17'
        Statement:
          - Sid: ''
            Effect: Allow
            Principal:
              Service: 'ec2.amazonaws.com'
            Action: 'sts:AssumeRole'
      Path: '/'
      Policies:
        - PolicyName: 'root'
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: 'Allow'
                Action:
                  - 's3:ListBucket'
                  - 's3:GetObject*'
                  - 's3:PutObject*'
                  - 's3:DeleteObject*'
                # The former 'arn:aws:iam:::role/${EMRRole}' entry was removed:
                # it lacked an account ID and S3 actions never apply to an
                # IAM role ARN, so it granted nothing.
                Resource:
                  - !Sub 'arn:aws:s3:::${BootstrapBucket}'
                  - !Sub 'arn:aws:s3:::${BootstrapBucket}/*'
                  - 'arn:aws:s3:::aws-bigdata-blog'
                  - 'arn:aws:s3:::aws-bigdata-blog/*'

  EMREC2InstanceProfile:
    Type: AWS::IAM::InstanceProfile
    Properties:
      Path: '/'
      Roles:
        - !Ref EMREC2Role

Outputs:
  EMRClusterID:
    Description: Id of the EMR Cluster
    Value: !Ref Cluster
    Export:
      Name: !Sub 'EMRClusterID-${AWS::StackName}'