# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of this # software and associated documentation files (the "Software"), to deal in the Software # without restriction, including without limitation the rights to use, copy, modify, # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. AWSTemplateFormatVersion: "2010-09-09" Description: > This template sets up sample AWS Glue resources to be orchestrated by AWS Step Functions. Parameters: MarketingAndSalesDatabaseName: Type: String MinLength: "4" Default: "marketingandsales_qs" Description: "Name of the AWS Glue database to contain this CloudFormation template's tables." SalesPipelineTableName: Type: String MinLength: "4" Default: "salespipeline_qs" Description: "Name of the Sales Pipeline data table in AWS Glue." MarketingTableName: Type: String MinLength: "4" Default: "marketing_qs" Description: "Name of the Marketing data table in AWS Glue." ETLScriptsPrefix: Type: String MinLength: "1" Description: "Location of the Glue job ETL scripts in S3." ETLOutputPrefix: Type: String MinLength: "1" Description: "Name of the S3 output path to which this CloudFormation template's AWS Glue jobs are going to write ETL output." DataBucketName: Type: String MinLength: "1" Description: "Name of the S3 bucket in which the source Marketing and Sales data will be uploaded. Bucket is created by this CFT." ArtifactBucketName: Type: String MinLength: "1" Description: "Name of the S3 bucket in which the Marketing and Sales ETL scripts reside. Bucket is NOT created by this CFT." Resources: ### AWS GLUE RESOURCES ### AWSGlueJobRole: Type: "AWS::IAM::Role" Properties: AssumeRolePolicyDocument: Version: '2012-10-17' Statement: - Effect: Allow Principal: Service: - glue.amazonaws.com Action: - sts:AssumeRole Policies: - PolicyName: root PolicyDocument: Version: 2012-10-17 Statement: - Effect: Allow Action: - "s3:GetObject" - "s3:PutObject" - "s3:ListBucket" - "s3:DeleteObject" Resource: - !Sub "arn:aws:s3:::${DataBucketName}" - !Sub "arn:aws:s3:::${DataBucketName}/*" - !Sub "arn:aws:s3:::${ArtifactBucketName}" - !Sub "arn:aws:s3:::${ArtifactBucketName}/*" ManagedPolicyArns: - arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole Path: "/" MarketingAndSalesDatabase: Type: "AWS::Glue::Database" Properties: DatabaseInput: Description: "Marketing and Sales database (Amazon QuickSight Samples)." Name: !Ref MarketingAndSalesDatabaseName CatalogId: !Ref AWS::AccountId SalesPipelineTable: Type: "AWS::Glue::Table" DependsOn: MarketingAndSalesDatabase Properties: TableInput: Description: "Sales Pipeline table (Amazon QuickSight Sample)." TableType: "EXTERNAL_TABLE" Parameters: { "CrawlerSchemaDeserializerVersion": "1.0", "compressionType": "none", "classification": "csv", "recordCount": "16831", "typeOfData": "file", "CrawlerSchemaSerializerVersion": "1.0", "columnsOrdered": "true", "objectCount": "1", "delimiter": ",", "skip.header.line.count": "1", "averageRecordSize": "119", "sizeKey": "2002910" } StorageDescriptor: StoredAsSubDirectories: False Parameters: { "CrawlerSchemaDeserializerVersion": "1.0", "compressionType": "none", "classification": "csv", "recordCount": "16831", "typeOfData": "file", "CrawlerSchemaSerializerVersion": "1.0", "columnsOrdered": "true", "objectCount": "1", "delimiter": ",", "skip.header.line.count": "1", "averageRecordSize": "119", "sizeKey": "2002910" } InputFormat: "org.apache.hadoop.mapred.TextInputFormat" OutputFormat: "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat" Columns: - Type: string Name: date - Type: string Name: salesperson - Type: string Name: lead name - Type: string Name: segment - Type: string Name: region - Type: string Name: target close - Type: bigint Name: forecasted monthly revenue - Type: string Name: opportunity stage - Type: bigint Name: weighted revenue - Type: boolean Name: closed opportunity - Type: boolean Name: active opportunity - Type: boolean Name: latest status entry SerdeInfo: Parameters: { "field.delim": "," } SerializationLibrary: "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe" Compressed: False Location: !Sub "s3://${DataBucketName}/sales/" Retention: 0 Name: !Ref SalesPipelineTableName DatabaseName: !Ref MarketingAndSalesDatabaseName CatalogId: !Ref AWS::AccountId MarketingTable: Type: "AWS::Glue::Table" DependsOn: MarketingAndSalesDatabase Properties: TableInput: Description: "Marketing table (Amazon QuickSight Sample)." TableType: "EXTERNAL_TABLE" Parameters: { "CrawlerSchemaDeserializerVersion": "1.0", "compressionType": "none", "classification": "csv", "recordCount": "948", "typeOfData": "file", "CrawlerSchemaSerializerVersion": "1.0", "columnsOrdered": "true", "objectCount": "1", "delimiter": ",", "skip.header.line.count": "1", "averageRecordSize": "160", "sizeKey": "151746" } StorageDescriptor: StoredAsSubDirectories: False Parameters: { "CrawlerSchemaDeserializerVersion": "1.0", "compressionType": "none", "classification": "csv", "recordCount": "948", "typeOfData": "file", "CrawlerSchemaSerializerVersion": "1.0", "columnsOrdered": "true", "objectCount": "1", "delimiter": ",", "skip.header.line.count": "1", "averageRecordSize": "160", "sizeKey": "151746" } InputFormat: "org.apache.hadoop.mapred.TextInputFormat" OutputFormat: "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat" Columns: - Type: string Name: date - Type: bigint Name: new visitors seo - Type: bigint Name: new visitors cpc - Type: bigint Name: new visitors social media - Type: bigint Name: return visitors - Type: bigint Name: twitter mentions - Type: bigint Name: twitter follower adds - Type: bigint Name: twitter followers cumulative - Type: bigint Name: mailing list adds - Type: bigint Name: mailing list cumulative - Type: bigint Name: website pageviews - Type: bigint Name: website visits - Type: bigint Name: website unique visits - Type: bigint Name: mobile uniques - Type: bigint Name: tablet uniques - Type: bigint Name: desktop uniques - Type: bigint Name: free sign up - Type: bigint Name: paid conversion - Type: string Name: events SerdeInfo: Parameters: { "field.delim": "," } SerializationLibrary: "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe" Compressed: False Location: !Sub "s3://${DataBucketName}/marketing/" Retention: 0 Name: !Ref MarketingTableName DatabaseName: !Ref MarketingAndSalesDatabaseName CatalogId: !Ref AWS::AccountId ProcessSalesDataJob: Type: "AWS::Glue::Job" Properties: Role: !Ref AWSGlueJobRole Name: "ProcessSalesData" Command: { "Name" : "glueetl", "ScriptLocation": !Sub "s3://${ArtifactBucketName}/${ETLScriptsPrefix}/process_sales_data.py" } DefaultArguments: { "--database_name" : !Ref MarketingAndSalesDatabaseName, "--table_name" : !Ref SalesPipelineTableName, "--s3_output_path": !Sub "s3://${DataBucketName}/${ETLOutputPrefix}/tmp/sales" } MaxRetries: 0 Description: "Process Sales Pipeline data." AllocatedCapacity: 5 ProcessMarketingDataJob: Type: "AWS::Glue::Job" Properties: Role: !Ref AWSGlueJobRole Name: "ProcessMarketingData" Command: { "Name" : "glueetl", "ScriptLocation": !Sub "s3://${ArtifactBucketName}/${ETLScriptsPrefix}/process_marketing_data.py" } DefaultArguments: { "--database_name" : !Ref MarketingAndSalesDatabaseName, "--table_name" : !Ref MarketingTableName, "--s3_output_path": !Sub "s3://${DataBucketName}/${ETLOutputPrefix}/tmp/marketing" } MaxRetries: 0 Description: "Process Marketing data." AllocatedCapacity: 5 JoinMarketingAndSalesDataJob: Type: "AWS::Glue::Job" Properties: Role: !Ref AWSGlueJobRole Name: "JoinMarketingAndSalesData" Command: { "Name" : "glueetl", "ScriptLocation": !Sub "s3://${ArtifactBucketName}/${ETLScriptsPrefix}/join_marketing_and_sales_data.py" } DefaultArguments: { "--database_name": !Ref MarketingAndSalesDatabaseName, "--s3_output_path": !Sub "s3://${DataBucketName}/${ETLOutputPrefix}/sales-leads-influenced", "--s3_sales_data_path": !Sub "s3://${DataBucketName}/${ETLOutputPrefix}/tmp/sales", "--s3_marketing_data_path": !Sub "s3://${DataBucketName}/${ETLOutputPrefix}/tmp/marketing" } MaxRetries: 0 Description: "Join Marketing and Sales data." AllocatedCapacity: 5