# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# CloudFormation template that provisions:
#   - an S3 bucket for the collected data (DataLakeDataStore),
#   - an IAM role assumable by AWS Glue (AWSGlueJobRole),
#   - a Glue Data Catalog database (DataLake360DB),
#   - a Glue Python shell job (DataLakeDataCollectorJob),
#   - a scheduled Glue trigger that starts the job (ScheduledJobTrigger).
# The bucket holding the ETL script (ArtifactBucketName) is only referenced
# here; it is not created by this template.

AWSTemplateFormatVersion: "2010-09-09"

Description: >
  This template sets up AWS Glue Job, Trigger, IAM Role, S3 bucket.

Parameters:
  # Name given to the Glue Data Catalog database created below.
  DataLake360DatabaseName:
    Type: String
    MinLength: "1"
    Description: "Data Lake 360 Database Name."
    Default: "datalake360db"

  # Key prefix inside ArtifactBucketName under which the job script is looked up.
  ETLScriptsPrefix:
    Type: String
    MinLength: "1"
    Description: "Location of the Glue job ETL script in S3."
    Default: 'dataLakeDataCollector/glueJob'

  # Used as the BucketName of DataLakeDataStore and passed to the job as
  # its --s3_output_bucket argument.
  DataBucketName:
    Type: String
    MinLength: "1"
    Description: "Name of the S3 bucket in which the DataLake360 data will be uploaded. Bucket is created by this CFT."
    Default: 'www-datalake-data'

  # Pre-existing bucket that holds the job script; only referenced, never created.
  ArtifactBucketName:
    Type: String
    MinLength: "1"
    Description: "Name of the S3 bucket in which the ETL script reside. Bucket is NOT created by this CFT. User need to upload the script file manually"
    Default: 'www-datalake-code'

  # Schedule expression handed unchanged to the Glue trigger.
  JobSchedule:
    Type: String
    MinLength: "1"
    Description: "ETL Job Schedule time in cron format"
    Default: 'cron(00 23 * * ? *)'

Resources:
  # Bucket that receives the data; its name comes from the DataBucketName parameter.
  DataLakeDataStore:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Ref DataBucketName

  ### AWS GLUE RESOURCES ###

  # Role the Glue job runs as. The trust policy lets glue.amazonaws.com assume it.
  AWSGlueJobRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - glue.amazonaws.com
            Action:
              - "sts:AssumeRole"
      Policies:
        - PolicyName: root
          PolicyDocument:
            # Quoted so YAML loaders keep the policy version as a string
            # instead of resolving it to a date.
            Version: '2012-10-17'
            Statement:
              # A single statement covers both buckets. Bucket ARNs and
              # object ARNs ("/*") are both listed because the action list
              # mixes s3:ListBucket with object-level actions.
              - Effect: Allow
                Action:
                  - "s3:GetObject"
                  - "s3:PutObject"
                  - "s3:ListBucket"
                  - "s3:DeleteObject"
                  - "s3:GetObjectTagging"
                Resource:
                  - !Sub "arn:aws:s3:::${DataBucketName}"
                  - !Sub "arn:aws:s3:::${DataBucketName}/*"
                  - !Sub "arn:aws:s3:::${ArtifactBucketName}"
                  - !Sub "arn:aws:s3:::${ArtifactBucketName}/*"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
      Path: "/"

  # Data Catalog database, created in the catalog of the deploying account.
  DataLake360DB:
    Type: AWS::Glue::Database
    Properties:
      DatabaseInput:
        Description: "DataLake 360 database."
        Name: !Ref DataLake360DatabaseName
      CatalogId: !Ref AWS::AccountId

  # Python shell job whose script is read from
  # s3://<ArtifactBucketName>/<ETLScriptsPrefix>/dataLakeDataCollector.py.
  DataLakeDataCollectorJob:
    Type: AWS::Glue::Job
    Properties:
      Role: !Ref AWSGlueJobRole
      Name: "DataLake360-DataCollector"
      Command:
        Name: "pythonshell"
        ScriptLocation: !Sub "s3://${ArtifactBucketName}/${ETLScriptsPrefix}/dataLakeDataCollector.py"
        PythonVersion: "3"
      DefaultArguments:
        # Tells the script which bucket to write its output to.
        "--s3_output_bucket": !Ref DataBucketName
      # A failed run is not retried automatically.
      MaxRetries: 0
      Description: "Collect Data Lake metadata."
      MaxCapacity: 1
      # Quoted: GlueVersion is a string property, and a bare 1.0 would be
      # loaded as a float.
      GlueVersion: "1.0"

  # Starts DataLakeDataCollectorJob on the JobSchedule expression. With the
  # default value that is minute 00 of hour 23 every day.
  # NOTE(review): the time zone is not stated in this template - confirm
  # against the Glue schedule documentation.
  ScheduledJobTrigger:
    Type: AWS::Glue::Trigger
    Properties:
      Type: SCHEDULED
      Description: Glue Job Schedule Trigger
      Schedule: !Ref JobSchedule
      Actions:
        - JobName: !Ref DataLakeDataCollectorJob
      Name: datalake360-scheduled
      # The trigger is started as part of its creation instead of being left
      # deactivated.
      StartOnCreation: true