---
# Glue CF Stack
#
# CloudFormation template in YAML that creates, for Amazon Connect data:
#   * an AWS Glue database,
#   * an IAM role assumed by Glue,
#   * three Glue crawlers (Connect CTR, Contact Lens, customer data),
#   * one Glue ETL job that runs a script stored on S3.
AWSTemplateFormatVersion: '2010-09-09'

# Parameters substituted in the Resources section.
# These parameters are names/locations of the resources created in the
# Data Catalog.
Parameters:
  CFNDatabaseName:
    Type: String
    Default: amazonconnect_db
    Description: Database name to hold all the tables for Connect Data

  # Required: no default, because the bucket is account-specific.
  # Example: s3://connect-mmnm-ctr/contact-centre/
  #   <- this is the parent prefix inside which you have the
  #      year/month/day structure of the data
  CFNS3ConnectDataPath:
    Type: String
    Description: S3 path for Amazon connect CTR data

  # Required: no default, because the bucket is account-specific.
  # Example: s3://amazon-connect-6e0d69da3532/Analysis/Voice
  #   <- this is the S3 prefix where the Contact Lens data is saved
  CFNS3ContactlensDataPath:
    Type: String
    Description: S3 path for Amazon contact Lens Data

  # Required: no default, because the bucket is account-specific.
  # Example: s3://amazon-connect-f5277b9142db/Customerdata
  #   <- this is where the customer data is saved on S3; it can be
  #      extracted from other sources, such as Salesforce etc.
  CFNS3ConnectCustomerDataPath:
    Type: String
    Description: S3 path for Amazon connect Customer Data

  CFNCrawlerNameForConnectData:
    Type: String
    Default: cfn-crawler-connect-ctr-data
    Description: Crawler Name for Amazon Connect Data

  CFNCrawlerNameForContactlensData:
    Type: String
    Default: cfn-crawler-contact-lens-data
    Description: Crawler Name for Amazon Contact lens Data

  CFNCrawlerNameForConnectCustomerData:
    Type: String
    Default: cfn-crawler-connect-customer-data
    Description: Crawler Name for Amazon Connect Customer Data

  CFNTablePrefixName:
    Type: String
    Default: cfn_connect_
    Description: prefix to for all the tables to be created for Amazon connect data

  # Required: no default, because the bucket is account-specific.
  # Example:
  #   s3://amazon-connect-f5277b9142db/scripts/connect-blog-aggregateCustomerSentimets.py
  #   <- this is where the Glue script is saved on S3
  CFNScriptLocation:
    Type: String
    Description: Glue ETl Script location on S3

# Resources to create metadata in the Data Catalog
Resources:
  ###
  # Create an AWS Glue database
  CFNDatabaseConnect:
    Type: AWS::Glue::Database
    Properties:
      CatalogId: !Ref AWS::AccountId
      DatabaseInput:
        Name: !Ref CFNDatabaseName
        Description: Database to hold tables for Amazon Connect data

  ###
  # Create the IAM role assumed by the crawlers and the ETL job.
  # For demonstration, this role is given all permissions.
  # NOTE(review): Action "*" on Resource "*" is full administrator access.
  # Before using this outside a demo, scope the policy down to the Glue
  # and S3 permissions the crawlers and the script actually need.
  CFNRoleConnect:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: "Allow"
            Principal:
              Service:
                - "glue.amazonaws.com"
            Action:
              - "sts:AssumeRole"
      Path: "/"
      Policies:
        - PolicyName: "root"
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: "Allow"
                Action: "*"
                Resource: "*"

  # Create crawlers
  #
  # All three crawlers write into the database named by CFNDatabaseName,
  # prefix their tables with CFNTablePrefixName, update changed schemas in
  # place and only log deleted objects. DependsOn makes CloudFormation
  # create the database before each crawler (and delete it after them),
  # since the crawlers reference the database by parameter, not by resource.

  # Crawler for Connect CTR data
  CFNCrawlerConnectCTR:
    Type: AWS::Glue::Crawler
    DependsOn: CFNDatabaseConnect
    Properties:
      Name: !Ref CFNCrawlerNameForConnectData
      Role: !GetAtt CFNRoleConnect.Arn
      # Classifiers: none, use the default classifier
      Description: AWS Glue crawler to crawl Amazon Connect CTR data
      # Schedule: none, use default run-on-demand
      DatabaseName: !Ref CFNDatabaseName
      Targets:
        S3Targets:
          # S3 location with the Connect CTR data
          - Path: !Ref CFNS3ConnectDataPath
      TablePrefix: !Ref CFNTablePrefixName
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "LOG"
      # Crawler configuration is passed as a JSON string.
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'

  # Crawler for Contact Lens data
  CFNCrawlerContactlens:
    Type: AWS::Glue::Crawler
    DependsOn: CFNDatabaseConnect
    Properties:
      Name: !Ref CFNCrawlerNameForContactlensData
      Role: !GetAtt CFNRoleConnect.Arn
      # Classifiers: none, use the default classifier
      Description: AWS Glue crawler to crawl Amazon Connect Contact Lens data
      # Schedule: none, use default run-on-demand
      DatabaseName: !Ref CFNDatabaseName
      Targets:
        S3Targets:
          # S3 location with the Contact Lens data
          - Path: !Ref CFNS3ContactlensDataPath
      TablePrefix: !Ref CFNTablePrefixName
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "LOG"
      # Crawler configuration is passed as a JSON string.
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'

  # Crawler for Connect customer data
  CFNCrawlerConnectCustomer:
    Type: AWS::Glue::Crawler
    DependsOn: CFNDatabaseConnect
    Properties:
      Name: !Ref CFNCrawlerNameForConnectCustomerData
      Role: !GetAtt CFNRoleConnect.Arn
      # Classifiers: none, use the default classifier
      Description: AWS Glue crawler to crawl Amazon Connect customer data
      # Schedule: none, use default run-on-demand
      DatabaseName: !Ref CFNDatabaseName
      Targets:
        S3Targets:
          # S3 location with the Connect customer data
          - Path: !Ref CFNS3ConnectCustomerDataPath
      TablePrefix: !Ref CFNTablePrefixName
      SchemaChangePolicy:
        UpdateBehavior: "UPDATE_IN_DATABASE"
        DeleteBehavior: "LOG"
      # Crawler configuration is passed as a JSON string.
      Configuration: '{"Version":1.0,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"},"Tables":{"AddOrUpdateBehavior":"MergeNewColumns"}}}'

  ####################################
  # Create Glue ETL Job
  CFNAggregateCustomerSentiments:
    Type: AWS::Glue::Job
    Properties:
      Name: "CFN-Aggregate-Customer-Sentiments"
      Role: !GetAtt CFNRoleConnect.Arn
      # Quoted so YAML keeps the version a string instead of the float 2.0.
      GlueVersion: "2.0"
      DefaultArguments:
        "--job-bookmark-option": "job-bookmark-enable"
        # If the script is written in Scala, also set
        # '--job-language': 'scala' and '--class': 'your scala class'.
      # Connections: no connection needed for S3 to S3 job
      #   ConnectionsList
      Description: Job created with CloudFormation
      # LogUri: String
      Command:
        Name: glueetl
        ScriptLocation: !Ref CFNScriptLocation
        # for access to directories use proper IAM role with permission to
        # buckets and folders that begin with "aws-glue-"
        # script uses temp directory from job definition if required
        # (temp directory not used S3 to S3)
        # script defines target for output as s3://aws-glue-target/sal
      WorkerType: G.1X
      NumberOfWorkers: 5
      ExecutionProperty:
        MaxConcurrentRuns: 1
      MaxRetries: 0