# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
---
# SAM/CloudFormation template for the Deequ data-quality pipeline:
# an EventBridge schedule starts a Step Functions state machine, which runs
# the controller Glue job and then a Lambda function that is given the Glue
# crawler name. An Amplify web UI is created when pCreateFrontEnd is "true".
AWSTemplateFormatVersion: "2010-09-09"
Transform: AWS::Serverless-2016-10-31
Description: Data Quality with Deequ on AWS Glue

Parameters:
  pArtifactsBucket:
    Description: The name of the artifacts bucket
    Type: String
  # Kept as a quoted string (not a boolean) because the Conditions below
  # compare it against the literal strings "true" / "false".
  pCreateFrontEnd:
    Description: Set to true to create an AWS Amplify web UI
    Type: String
    AllowedValues:
      - "true"
      - "false"
    Default: "true"
  pEnv:
    Description: The environment name
    Type: String
    Default: dev

Globals:
  Function:
    # python3.7 is retired on AWS Lambda and can no longer be used to create
    # or update functions, so the stack could not be deployed with it.
    # NOTE(review): the function code under CodeUri is not visible here;
    # verify it runs on this runtime.
    Runtime: python3.12
    Handler: lambda_function.lambda_handler

Conditions:
  EnableFrontEnd: !Equals [!Ref pCreateFrontEnd, "true"]
  DisableFrontEnd: !Equals [!Ref pCreateFrontEnd, "false"]

Resources:
  # Amplify branch named after the environment; only with the front end.
  AmplifyDevBranch:
    Type: AWS::Amplify::Branch
    Condition: EnableFrontEnd
    Properties:
      BranchName: !Ref pEnv
      AppId: !GetAtt AmplifyApp.AppId
      EnableAutoBuild: true

  # Amplify app built from the amazon-deequ-glue CodeCommit repository.
  AmplifyApp:
    Type: AWS::Amplify::App
    Condition: EnableFrontEnd
    Properties:
      Name: deequ-constraints
      Repository: !Sub "https://git-codecommit.${AWS::Region}.amazonaws.com/v1/repos/amazon-deequ-glue"
      Description: Deequ - Data Quality Constraints
      IAMServiceRole: !GetAtt AmplifyServiceRole.Arn
      BuildSpec: |
        version: 0.1
        backend:
          phases:
            preBuild:
              commands:
                - cd src/deequ-constraints
                - npm install -g @aws-amplify/cli
            build:
              commands:
                - amplifyPush --simple
        frontend:
          phases:
            preBuild:
              commands:
                - yarn install
            build:
              commands:
                - yarn run build
          artifacts:
            baseDirectory: src/deequ-constraints/build
            files:
              - '**/*'
          cache:
            paths:
              - src/deequ-constraints/node_modules/**/*

  ######## EVENTS #########
  # Schedule for the state machine. Created DISABLED, so nothing runs until
  # the rule is enabled; the Input below holds placeholder database/tables.
  rDataQualityTriggerEventRule:
    Type: AWS::Events::Rule
    Properties:
      Description: Trigger Data Quality Step Function every 30 minutes
      State: DISABLED
      ScheduleExpression: "cron(*/30 * * * ? *)"
      Targets:
        - Id: data-quality-event-rule
          Arn: !Ref rDataQualityStateMachine
          RoleArn: !GetAtt rEventTriggerRole.Arn
          Input: |
            {
              "glueDatabase": "default",
              "glueTables": "table1,table2"
            }

  ####### S3 #########
  # Versioned, AES256-encrypted bucket with all public access blocked.
  # It is passed to the Glue jobs as --targetBucketName and is the crawler's
  # S3 target.
  rDataQualityBucket:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Sub "data-quality-${AWS::Region}-${AWS::AccountId}"
      VersioningConfiguration:
        Status: Enabled
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256
      PublicAccessBlockConfiguration:
        BlockPublicAcls: true
        BlockPublicPolicy: true
        IgnorePublicAcls: true
        RestrictPublicBuckets: true

  ######## IAM #########
  # Service role assumed by Amplify; additionally allowed to manage the
  # per-environment AppSync GraphQLApi SSM parameter.
  AmplifyServiceRole:
    Type: AWS::IAM::Role
    Condition: EnableFrontEnd
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - amplify.amazonaws.com
            Action: sts:AssumeRole
      Path: /
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AdministratorAccess-Amplify
      Policies:
        - PolicyName: "amplify-console-role"
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Resource:
                  - !Sub "arn:aws:ssm:${AWS::Region}:${AWS::AccountId}:parameter/DataQuality/${pEnv}/AppSync/GraphQLApi"
                Effect: Allow
                Action:
                  - ssm:AddTagsToResource
                  - ssm:GetParameter
                  - ssm:GetParameters
                  - ssm:DeleteParameter
                  - ssm:DeleteParameters
                  - ssm:ListTagsForResource
                  - ssm:PutParameter

  # Shared by the Glue job role and the crawler Lambda role: CloudWatch Logs
  # write access plus read/write on DynamoDB tables prefixed "DataQuality".
  rCommonIAMPolicy:
    Type: AWS::IAM::ManagedPolicy
    Properties:
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - logs:CreateLogGroup
            Resource: !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:*"
          - Effect: Allow
            Action:
              - logs:CreateLogStream
              - logs:PutLogEvents
            Resource:
              - !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*"
              - !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/glue/*"
          - Effect: Allow
            Action:
              - dynamodb:BatchGetItem
              - dynamodb:DescribeTable
              - dynamodb:GetItem
              - dynamodb:GetRecords
              - dynamodb:Query
              - dynamodb:Scan
              - dynamodb:BatchWriteItem
              - dynamodb:DeleteItem
              - dynamodb:UpdateItem
              - dynamodb:PutItem
            Resource:
              - !Sub "arn:aws:dynamodb:${AWS::Region}:${AWS::AccountId}:table/DataQuality*"

  # Role used by every Glue job in this template and by the Glue crawler.
  # NOTE(review): AmazonS3FullAccess is broad; consider scoping it to the
  # buckets the jobs actually read and write once those are known.
  rGlueJobsIAMRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - glue.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: /
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
        - arn:aws:iam::aws:policy/AmazonS3FullAccess
        - !Ref rCommonIAMPolicy
      Policies:
        - PolicyName: data-quality-jobs
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - ssm:GetParameter
                  - ssm:GetParameters
                Resource:
                  - !Sub "arn:aws:ssm:${AWS::Region}:${AWS::AccountId}:parameter/DataQuality/${pEnv}/AppSync/GraphQLApi"

  # Despite its name, this role is assumed by Lambda (the crawl-data
  # function), not by the crawler itself; it may only start rGlueCrawler.
  rGlueCrawlerRole:
    Type: AWS::IAM::Role
    Properties:
      ManagedPolicyArns:
        - !Ref rCommonIAMPolicy
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: data-quality-crawl
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - glue:StartCrawler
                Resource:
                  - !Sub "arn:aws:glue:${AWS::Region}:${AWS::AccountId}:crawler/${rGlueCrawler}"

  # Execution role for the state machine: invoke the crawler Lambda and
  # run/monitor the controller Glue job.
  rStatesExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - !Sub "states.${AWS::Region}.amazonaws.com"
            Action: sts:AssumeRole
      Policies:
        - PolicyName: data-quality-states-execution
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - lambda:InvokeFunction
                Resource:
                  - !GetAtt rGlueCrawlerLambdaFunction.Arn
              - Effect: Allow
                Action:
                  - glue:StartJobRun
                  - glue:GetJobRun*
                  # The glue:startJobRun.sync integration also needs this so
                  # Step Functions can stop the job run when the execution
                  # is stopped or times out.
                  - glue:BatchStopJobRun
                Resource:
                  - !Sub "arn:aws:glue:${AWS::Region}:${AWS::AccountId}:job/${rControllerGlueJob}"

  # Lets EventBridge start executions of the data-quality state machine.
  rEventTriggerRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - events.amazonaws.com
            Action: sts:AssumeRole
      Path: /
      Policies:
        - PolicyName: data-quality-trigger
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - states:StartExecution
                Resource: !Ref rDataQualityStateMachine

  ####### DYNAMODB #########
  # Both tables are created only when the front end is disabled.
  # NOTE(review): presumably the Amplify backend provisions the equivalent
  # tables when the front end is enabled - confirm.
  rDataQualitySuggestionsDynamoDBTable:
    Type: AWS::DynamoDB::Table
    Condition: DisableFrontEnd
    Properties:
      KeySchema:
        - AttributeName: id
          KeyType: HASH
      AttributeDefinitions:
        - AttributeName: id
          AttributeType: S
        - AttributeName: tableHashKey
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      GlobalSecondaryIndexes:
        - IndexName: table-index
          KeySchema:
            - AttributeName: tableHashKey
              KeyType: HASH
          Projection:
            ProjectionType: ALL
      TableName: !Sub "DataQualitySuggestion-${pEnv}"

  rDataQualityAnalysisDynamoDBTable:
    Type: AWS::DynamoDB::Table
    Condition: DisableFrontEnd
    Properties:
      KeySchema:
        - AttributeName: id
          KeyType: HASH
      AttributeDefinitions:
        - AttributeName: id
          AttributeType: S
        - AttributeName: tableHashKey
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      GlobalSecondaryIndexes:
        - IndexName: table-index
          KeySchema:
            - AttributeName: tableHashKey
              KeyType: HASH
          Projection:
            ProjectionType: ALL
      TableName: !Sub "DataQualityAnalyzer-${pEnv}"

  ######## LAMBDA #########
  # Final step of the state machine; receives the crawler name through the
  # GLUE_CRAWLER environment variable. Runtime and handler come from Globals.
  rGlueCrawlerLambdaFunction:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: ./main/utils/crawl-data/src
      FunctionName: data-quality-crawl-data
      Description: Glue crawler
      Environment:
        Variables:
          GLUE_CRAWLER: !Ref rGlueCrawler
      MemorySize: 256
      Timeout: 120
      Role: !GetAtt rGlueCrawlerRole.Arn

  ######## STEP FUNCTIONS #########
  # Two-step workflow: run the controller Glue job synchronously, then
  # invoke the crawler Lambda. ${gController} and ${lCrawler} are filled in
  # by !Sub from the variable map that follows the definition.
  rDataQualityStateMachine:
    Type: AWS::StepFunctions::StateMachine
    Properties:
      StateMachineName: data-quality-sm
      DefinitionString: !Sub
        - |-
          {
            "Comment": "Data Quality WorkFlow",
            "StartAt": "Data Quality Controller",
            "States": {
              "Data Quality Controller": {
                "Type": "Task",
                "Resource": "arn:aws:states:::glue:startJobRun.sync",
                "Parameters": {
                  "JobName": "${gController}",
                  "Arguments": {
                    "--glueDatabase.$":"$.glueDatabase",
                    "--glueTables.$":"$.glueTables"
                  }
                },
                "ResultPath": "$.output",
                "Next": "Run Glue Crawler"
              },
              "Run Glue Crawler": {
                "Type": "Task",
                "Resource": "${lCrawler}",
                "Comment": "Run Glue Crawler",
                "ResultPath": "$.statusCode",
                "End": true
              }
            }
          }
        - gController: !Ref rControllerGlueJob
          lCrawler: !GetAtt rGlueCrawlerLambdaFunction.Arn
      RoleArn: !GetAtt rStatesExecutionRole.Arn

  ######## GLUE #########
  # Python-shell job started by the state machine. It is handed the names of
  # the three Scala runner jobs below; the database/tables defaults are
  # placeholders that the state machine overrides through job arguments.
  rControllerGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Command:
        Name: pythonshell
        PythonVersion: "3"
        ScriptLocation: !Sub "s3://${pArtifactsBucket}/deequ/scripts/deequ-controller.py"
      DefaultArguments:
        "--TempDir": !Sub "s3://${pArtifactsBucket}/deequ/"
        "--enable-metrics": "true"
        "--env": !Ref pEnv
        "--glueSuggestionVerificationJob": !Ref rSuggestionVerificationRunnerGlueJob
        "--glueVerificationJob": !Ref rVerificationRunnerGlueJob
        "--glueProfilerJob": !Ref rProfilerRunnerGlueJob
        "--glueDatabase": "default"
        "--glueTables": "table1,table2"
        "--targetBucketName": !Sub "s3://${rDataQualityBucket}"
      ExecutionProperty:
        MaxConcurrentRuns: 10
      GlueVersion: "1.0"
      Timeout: 65
      Name: data-quality-controller
      Role: !GetAtt rGlueJobsIAMRole.Arn

  # Scala ETL job with the Deequ jar on the classpath.
  rSuggestionVerificationRunnerGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Command:
        Name: glueetl
        ScriptLocation: !Sub "s3://${pArtifactsBucket}/deequ/scripts/deequ-suggestion-analysis-verification-runner.scala"
      DefaultArguments:
        "--TempDir": !Sub "s3://${pArtifactsBucket}/deequ/"
        "--job-language": "scala"
        "--class": "GlueApp"
        "--enable-continuous-cloudwatch-log": "true"
        "--enable-continuous-log-filter": "true"
        "--enable-metrics": "true"
        "--enable-glue-datacatalog": ""
        "--extra-jars": !Sub "s3://${pArtifactsBucket}/deequ/jars/deequ-1.0.3-RC1.jar"
        "--dynamodbSuggestionTableName": "default"
        "--dynamodbAnalysisTableName": "default"
        "--glueDatabase": "default"
        "--glueTables": "table1,table2"
        "--targetBucketName": !Sub "s3://${rDataQualityBucket}"
      ExecutionProperty:
        MaxConcurrentRuns: 10
      GlueVersion: "2.0"
      MaxCapacity: 3
      MaxRetries: 0
      Timeout: 60
      Name: data-quality-suggestion-analysis-verification-runner
      Role: !GetAtt rGlueJobsIAMRole.Arn

  # Scala ETL job with the Deequ jar on the classpath.
  rVerificationRunnerGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Command:
        Name: glueetl
        ScriptLocation: !Sub "s3://${pArtifactsBucket}/deequ/scripts/deequ-analysis-verification-runner.scala"
      DefaultArguments:
        "--TempDir": !Sub "s3://${pArtifactsBucket}/deequ/"
        "--job-language": "scala"
        "--class": "GlueApp"
        "--enable-continuous-cloudwatch-log": "true"
        "--enable-continuous-log-filter": "true"
        "--enable-metrics": "true"
        "--enable-glue-datacatalog": ""
        "--extra-jars": !Sub "s3://${pArtifactsBucket}/deequ/jars/deequ-1.0.3-RC1.jar"
        "--dynamodbSuggestionTableName": "default"
        "--dynamodbAnalysisTableName": "default"
        "--glueDatabase": "default"
        "--glueTables": "table1,table2"
        "--targetBucketName": !Sub "s3://${rDataQualityBucket}"
      ExecutionProperty:
        MaxConcurrentRuns: 10
      MaxRetries: 0
      Timeout: 60
      GlueVersion: "2.0"
      MaxCapacity: 3
      Name: data-quality-analysis-verification-runner
      Role: !GetAtt rGlueJobsIAMRole.Arn

  # Scala ETL job with the Deequ jar on the classpath; unlike the two
  # verification runners it takes no DynamoDB table-name arguments.
  rProfilerRunnerGlueJob:
    Type: AWS::Glue::Job
    Properties:
      Command:
        Name: glueetl
        ScriptLocation: !Sub "s3://${pArtifactsBucket}/deequ/scripts/deequ-profile-runner.scala"
      DefaultArguments:
        "--TempDir": !Sub "s3://${pArtifactsBucket}/deequ/"
        "--job-language": "scala"
        "--class": "GlueApp"
        "--enable-continuous-cloudwatch-log": "true"
        "--enable-continuous-log-filter": "true"
        "--enable-metrics": "true"
        "--enable-glue-datacatalog": ""
        "--extra-jars": !Sub "s3://${pArtifactsBucket}/deequ/jars/deequ-1.0.3-RC1.jar"
        "--glueDatabase": "default"
        "--glueTables": "table1,table2"
        "--targetBucketName": !Sub "s3://${rDataQualityBucket}"
      ExecutionProperty:
        MaxConcurrentRuns: 10
      MaxRetries: 0
      Timeout: 60
      GlueVersion: "2.0"
      MaxCapacity: 3
      Name: data-quality-profile-runner
      Role: !GetAtt rGlueJobsIAMRole.Arn

  # Glue database that the crawler writes to.
  rGlueDataCatalog:
    Type: AWS::Glue::Database
    Properties:
      CatalogId: !Ref AWS::AccountId
      DatabaseInput:
        Description: Data Quality Catalog
        Name: data_quality_db

  # Crawls the whole data-quality bucket into rGlueDataCatalog.
  rGlueCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Role: !GetAtt rGlueJobsIAMRole.Arn
      DatabaseName: !Ref rGlueDataCatalog
      Name: data-quality-crawler
      Targets:
        S3Targets:
          - Path: !Sub "s3://${rDataQualityBucket}"

  ######## SSM #########
  # Publishes the Amplify app id under a fixed parameter name; note the name
  # does not include pEnv.
  AmplifyAppIdSSM:
    Condition: EnableFrontEnd
    Type: AWS::SSM::Parameter
    Properties:
      Name: /DataQuality/Amplify/AppID
      Type: String
      Value: !GetAtt AmplifyApp.AppId
      Description: Data Quality Amplify App Id