---
# CloudFormation template: Glue workflow, crawlers, ETL jobs, triggers, the
# IAM role/policy they run under, the S3 zone buckets and the Glue database.
#
# Workflow chain wired up by the triggers below:
#   ScheduledStartGlueTrigger       -> MeterDataRawGlueCrawler
#   RawToCleanGlueTrigger           -> TransformRawToCleanGlueJob
#   CleanDataCrawlerGlueTrigger     -> MeterDataCleanCrawler
#   CleanToBusinessGlueTrigger      -> TransformCleanToBusinessPartitionGlueJob
#   BusinessDailyCrawlerGlueTrigger -> MeterDataBusinessDailyCrawler
AWSTemplateFormatVersion: "2010-09-09"
Description: "Creates Glue resources (qs-1r18anahd)"

Parameters:
  # Name of the Glue database; also passed to both jobs as --db_name.
  DBName:
    Type: String
    Default: meterdata

  # Bucket holding the Glue job scripts (read from the "admin/" prefix).
  GlueScriptsS3Bucket:
    Type: String

  # Bucket used for the jobs' --TempDir (under the "admin/" prefix).
  GlueTempS3Bucket:
    Type: String

  # Top-level key into LandingzoneTransformerMap; selects the raw-to-clean
  # script. Restricted to the keys that exist in the map so that a bad value
  # is rejected at parameter validation instead of failing inside !FindInMap.
  LandingzoneTransformer:
    Type: String
    AllowedValues:
      - default
      - london

  CreateLandingZoneS3Bucket:
    Type: String
    Default: "Yes"
    Description: >-
      Select 'No' if you have an existing S3 bucket with raw meter data which
      you would like to use. Otherwise leave default value as 'Yes'.
    AllowedValues:
      - "Yes"
      - "No"

  LandingZoneS3BucketName:
    Type: String
    Default: ""
    Description: >-
      (Conditional) You must provide a value if you select 'No' for
      'Create Landing Zone bucket' parameter above.

  # Passed to MaxCapacity on both Glue jobs.
  DPU:
    Type: String
    Description: "No. of DPUs for Glue Job"
    Default: "10"

Rules:
  # Enforces what the LandingZoneS3BucketName description already requires:
  # when no landing-zone bucket is created, an existing bucket name is needed.
  LandingZoneBucketNameRequired:
    RuleCondition: !Equals [!Ref CreateLandingZoneS3Bucket, "No"]
    Assertions:
      - Assert: !Not [!Equals [!Ref LandingZoneS3BucketName, ""]]
        AssertDescription: >-
          LandingZoneS3BucketName must be provided when
          CreateLandingZoneS3Bucket is 'No'.

Mappings:
  # Raw-to-clean script file name per landing-zone data format.
  LandingzoneTransformerMap:
    default:
      script: "transform_raw_to_clean.py"
    london:
      script: "transform_raw_to_clean_london.py"

Conditions:
  # True when this stack should create the landing-zone bucket itself.
  CreateLZS3BucketCondition: !Equals [!Ref CreateLandingZoneS3Bucket, "Yes"]

Resources:
  MeterDataGlueWorkflow:
    Type: AWS::Glue::Workflow
    Properties:
      Description: Workflow for manufactured meter data analytics

  # Declared without properties; not referenced by any other resource in
  # this template.
  LogsLogGroup:
    Type: AWS::Logs::LogGroup

  #
  # Glue Crawlers
  #

  # Crawls the "daily" prefix of the business-zone bucket.
  MeterDataBusinessDailyCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      # GlueIamRole is created under the /service-role/ path.
      Role: !Sub "service-role/${GlueIamRole}"
      Targets:
        S3Targets:
          - Path: !Sub "s3://${BusinessZoneS3Bucket}/daily"
      DatabaseName: !Ref DBName
      SchemaChangePolicy:
        UpdateBehavior: "LOG"
        DeleteBehavior: "DEPRECATE_IN_DATABASE"

  # Crawls the whole clean-zone bucket.
  MeterDataCleanCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Role: !Sub "service-role/${GlueIamRole}"
      Targets:
        S3Targets:
          - Path: !Sub "s3://${CleanZoneS3Bucket}"
      DatabaseName: !Ref DBName
      SchemaChangePolicy:
        UpdateBehavior: "LOG"
        DeleteBehavior: "DEPRECATE_IN_DATABASE"

  # Crawls the landing-zone bucket: the one created by this stack, or the
  # pre-existing bucket named by LandingZoneS3BucketName.
  MeterDataRawGlueCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Role: !Sub "service-role/${GlueIamRole}"
      Targets:
        S3Targets:
          - Path: !If
              - CreateLZS3BucketCondition
              - !Sub "s3://${LandingZoneS3Bucket}"
              - !Sub "s3://${LandingZoneS3BucketName}"
      DatabaseName: !Ref DBName
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "DEPRECATE_IN_DATABASE"
      # Crawler configuration JSON, kept as a single string value.
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"}},"Grouping":{"TableGroupingPolicy":"CombineCompatibleSchemas"}}'

  #
  # Glue Jobs
  #

  # ETL job: clean zone -> business zone.
  TransformCleanToBusinessPartitionGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Role: !GetAtt GlueIamRole.Arn
      ExecutionProperty:
        MaxConcurrentRuns: 1
      Command:
        Name: "glueetl"
        ScriptLocation: !Sub "s3://${GlueScriptsS3Bucket}/admin/transform_clean_to_business_partition.py"
        PythonVersion: "3"
      DefaultArguments:
        "--TempDir": !Sub "s3://${GlueTempS3Bucket}/admin"
        "--enable-metrics": ""
        "--enable-continuous-cloudwatch-log": "true"
        "--enable-continuous-log-filter": "true"
        "--job-bookmark-option": "job-bookmark-enable"
        "--job-language": "python"
        # Quoted so the argument is a string like its siblings.
        "--enable-s3-parquet-optimized-committer": "true"
        "--db_name": !Ref DBName
        # The clean-zone bucket name is passed as the table name argument.
        "--table_name": !Ref CleanZoneS3Bucket
        "--business_zone_bucket": !Ref BusinessZoneS3Bucket
        "--temp_workflow_bucket": !Ref TempWorkflowS3Bucket
      MaxRetries: 0
      # AllocatedCapacity: !Ref DPU   (disabled; MaxCapacity is used instead)
      MaxCapacity: !Ref DPU
      Timeout: 2880
      GlueVersion: "2.0"

  # ETL job: landing (raw) zone -> clean zone. The script is chosen through
  # LandingzoneTransformerMap by the LandingzoneTransformer parameter.
  TransformRawToCleanGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Role: !GetAtt GlueIamRole.Arn
      ExecutionProperty:
        MaxConcurrentRuns: 1
      Command:
        Name: "glueetl"
        ScriptLocation: !Sub
          - "s3://${GlueScriptsS3Bucket}/admin/${script}"
          - script: !FindInMap
              - LandingzoneTransformerMap
              - !Ref LandingzoneTransformer
              - script
        PythonVersion: "3"
      DefaultArguments:
        "--TempDir": !Sub "s3://${GlueTempS3Bucket}/admin"
        "--enable-metrics": ""
        "--enable-continuous-cloudwatch-log": "true"
        "--enable-continuous-log-filter": "true"
        "--job-bookmark-option": "job-bookmark-enable"
        "--job-language": "python"
        # Quoted so the argument is a string like its siblings.
        "--enable-s3-parquet-optimized-committer": "true"
        "--db_name": !Ref DBName
        # The landing-zone bucket name is passed as the table name argument.
        "--table_name": !If
          - CreateLZS3BucketCondition
          - !Ref LandingZoneS3Bucket
          - !Ref LandingZoneS3BucketName
        "--clean_data_bucket": !Ref CleanZoneS3Bucket
        "--temp_workflow_bucket": !Ref TempWorkflowS3Bucket
        "--region": !Ref "AWS::Region"
      MaxRetries: 0
      Timeout: 2880
      GlueVersion: "2.0"
      # AllocatedCapacity: !Ref DPU   (disabled; MaxCapacity is used instead)
      MaxCapacity: !Ref DPU

  #
  # Glue Triggers
  #

  # Entry point of the workflow: starts the raw-data crawler on a schedule.
  ScheduledStartGlueTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: SCHEDULED
      Description: "Scheduled trigger, 9am each day"
      # NOTE(review): Glue cron schedules are presumably evaluated in UTC -
      # confirm that "9am" above is meant as UTC.
      Schedule: "cron(0 9 * * ? *)"
      StartOnCreation: true
      WorkflowName: !Ref MeterDataGlueWorkflow
      Actions:
        - CrawlerName: !Ref MeterDataRawGlueCrawler

  # Raw crawler succeeded -> run the raw-to-clean job.
  RawToCleanGlueTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: "CONDITIONAL"
      StartOnCreation: true
      WorkflowName: !Ref MeterDataGlueWorkflow
      Actions:
        - JobName: !Ref TransformRawToCleanGlueJob
      Predicate:
        Conditions:
          - LogicalOperator: "EQUALS"
            CrawlerName: !Ref MeterDataRawGlueCrawler
            CrawlState: "SUCCEEDED"

  # Raw-to-clean job succeeded -> crawl the clean zone.
  CleanDataCrawlerGlueTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: "CONDITIONAL"
      StartOnCreation: true
      WorkflowName: !Ref MeterDataGlueWorkflow
      Actions:
        - CrawlerName: !Ref MeterDataCleanCrawler
      Predicate:
        Conditions:
          - LogicalOperator: "EQUALS"
            JobName: !Ref TransformRawToCleanGlueJob
            State: "SUCCEEDED"

  # Clean crawler succeeded -> run the clean-to-business job.
  CleanToBusinessGlueTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: "CONDITIONAL"
      StartOnCreation: true
      WorkflowName: !Ref MeterDataGlueWorkflow
      Actions:
        - JobName: !Ref TransformCleanToBusinessPartitionGlueJob
      Predicate:
        Conditions:
          - LogicalOperator: "EQUALS"
            CrawlerName: !Ref MeterDataCleanCrawler
            CrawlState: "SUCCEEDED"

  # Clean-to-business job succeeded -> crawl the business zone "daily" data.
  BusinessDailyCrawlerGlueTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: "CONDITIONAL"
      StartOnCreation: true
      WorkflowName: !Ref MeterDataGlueWorkflow
      Actions:
        - CrawlerName: !Ref MeterDataBusinessDailyCrawler
      Predicate:
        Conditions:
          - LogicalOperator: "EQUALS"
            JobName: !Ref TransformCleanToBusinessPartitionGlueJob
            State: "SUCCEEDED"

  #
  # IAM
  #

  # Grants s3:* on every bucket the workflow touches. Both the bucket ARN and
  # the "<bucket>/*" object ARN are listed: bucket-level actions match the
  # former, object-level actions only match the latter.
  MeterDataS3ManagedPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      Path: "/service-role/"
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "s3:*"
            Resource:
              - !Sub "arn:${AWS::Partition}:s3:::${BusinessZoneS3Bucket}"
              - !Sub "arn:${AWS::Partition}:s3:::${BusinessZoneS3Bucket}/*"
              # Landing zone: stack-created bucket or the pre-existing one.
              - !If
                - CreateLZS3BucketCondition
                - !Sub "arn:${AWS::Partition}:s3:::${LandingZoneS3Bucket}"
                - !Sub "arn:${AWS::Partition}:s3:::${LandingZoneS3BucketName}"
              - !If
                - CreateLZS3BucketCondition
                - !Sub "arn:${AWS::Partition}:s3:::${LandingZoneS3Bucket}/*"
                - !Sub "arn:${AWS::Partition}:s3:::${LandingZoneS3BucketName}/*"
              - !Sub "arn:${AWS::Partition}:s3:::${CleanZoneS3Bucket}"
              - !Sub "arn:${AWS::Partition}:s3:::${CleanZoneS3Bucket}/*"
              - !Sub "arn:${AWS::Partition}:s3:::${TempWorkflowS3Bucket}"
              - !Sub "arn:${AWS::Partition}:s3:::${TempWorkflowS3Bucket}/*"
              - !Sub "arn:${AWS::Partition}:s3:::${GlueScriptsS3Bucket}"
              - !Sub "arn:${AWS::Partition}:s3:::${GlueScriptsS3Bucket}/*"
              - !Sub "arn:${AWS::Partition}:s3:::${GlueTempS3Bucket}"
              - !Sub "arn:${AWS::Partition}:s3:::${GlueTempS3Bucket}/*"

  # Role assumed by the Glue service for all crawlers and jobs above.
  GlueIamRole:
    Type: AWS::IAM::Role
    Properties:
      Path: "/service-role/"
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: glue.amazonaws.com
            Action: "sts:AssumeRole"
      MaxSessionDuration: 3600
      ManagedPolicyArns:
        - !Ref MeterDataS3ManagedPolicy
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess"
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSGlueServiceRole"
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/AmazonRedshiftFullAccess"

  #
  # S3 buckets
  #

  BusinessZoneS3Bucket:
    Type: AWS::S3::Bucket

  # Only created when CreateLandingZoneS3Bucket is 'Yes'.
  LandingZoneS3Bucket:
    Type: AWS::S3::Bucket
    Condition: CreateLZS3BucketCondition

  CleanZoneS3Bucket:
    Type: AWS::S3::Bucket

  TempWorkflowS3Bucket:
    Type: AWS::S3::Bucket

  # Not referenced by any other resource or output in this template.
  CisS3Bucket:
    Type: AWS::S3::Bucket

  #
  # Glue Data Catalog
  #

  GlueDatabase:
    Type: AWS::Glue::Database
    Properties:
      DatabaseInput:
        Name: !Ref DBName
      CatalogId: !Ref "AWS::AccountId"

Outputs:
  # Name of the landing-zone bucket actually in use (created or supplied).
  LandingZoneS3Bucket:
    Value: !If
      - CreateLZS3BucketCondition
      - !Ref LandingZoneS3Bucket
      - !Ref LandingZoneS3BucketName

  CleanZoneS3Bucket:
    Value: !Ref CleanZoneS3Bucket

  TempWorkflowS3Bucket:
    Value: !Ref TempWorkflowS3Bucket

  BusinessZoneS3Bucket:
    Value: !Ref BusinessZoneS3Bucket

  GlueWorkflow:
    Value: !Ref MeterDataGlueWorkflow

  JobNameTransformCleanToBusiness:
    Value: !Ref TransformCleanToBusinessPartitionGlueJob

  NameCrawlerBusinessDaily:
    Value: !Ref MeterDataBusinessDailyCrawler

  GlueRoleArn:
    Value: !GetAtt GlueIamRole.Arn