{
  "AWSTemplateFormatVersion": "2010-09-09",
  "Description": "(SO0033) - machine-learning-for-all: Machine Learning for All is a solution that helps data scientists in the industry get started using machine learning to generate insights from their data. The solution provides a framework for an end-to-end machine learning process including ad-hoc data exploration, data processing and feature engineering, and modeling training and evaluation.",
  "Parameters": {
    "Industry": {
      "Description": "Select industry for which the schema is to be provided.",
      "Type": "String",
      "Default": "%%INDUSTRY%%",
      "AllowedValues": [ "%%INDUSTRY%%" ]
    },
    "S3Bucket": {
      "Description": "Please provide the name of the source artifacts bucket.",
      "Type": "String",
      "Default": ""
    },
    "S3PrefixRaw": {
      "Description": "Please provide the prefix in S3 bucket to process source data.",
      "Type": "String",
      "Default": "machine-learning-for-all/%%VERSION%%/data"
    },
    "DestinationBucket": {
      "Description": "Please provide the name of the destination bucket.",
      "Type": "String",
      "Default": ""
    },
    "S3PrefixTransform": {
      "Description": "Please provide the prefix in S3 destination bucket to transform the files.",
      "Type": "String",
      "Default": "machine-learning-for-all/%%VERSION%%/parq-conv"
    },
    "SyntheticData": {
      "Description": "Transfers Synthetic Demo data.",
      "Type": "String",
      "Default": "Yes",
      "AllowedValues": [ "Yes", "No" ]
    },
    "CronJob": {
      "Description": "Cron for glue job scheduling.",
      "Type": "String",
      "Default": "cron(0 0 1/1 * ? *)"
    },
    "AutoStartTransformation": {
      "Description": "Initiate ETL Job Transformation for converting csv to parquet format.",
      "Type": "String",
      "Default": "Yes",
      "AllowedValues": [ "Yes", "No" ]
    },
    "DeploySagemaker": {
      "Description": "Deploys a Sagemaker Instance.",
      "Type": "String",
      "Default": "Yes",
      "AllowedValues": [ "Yes", "No" ]
    },
    "NotebookInstanceType": {
      "AllowedValues": [ "ml.t2.medium", "ml.m4.xlarge", "ml.p2.xlarge" ],
      "ConstraintDescription": "Must select a valid notebook instance type.",
      "Default": "ml.t2.medium",
      "Description": "Select Instance type for the SageMaker Notebook.",
      "Type": "String"
    },
    "KmsKeyId": {
      "Description": "AWS KMS key ID used to encrypt data at rest on the ML storage volume attached to notebook instance.",
      "Default": "",
      "Type": "String"
    }
  },
  "Metadata": {
    "AWS::CloudFormation::Interface": {
      "ParameterGroups": [
        {
          "Label": { "default": "Data Configuration" },
          "Parameters": [
            "Industry",
            "S3Bucket",
            "S3PrefixRaw",
            "DestinationBucket",
            "S3PrefixTransform",
            "SyntheticData"
          ]
        },
        {
          "Label": { "default": "ETL" },
          "Parameters": [ "CronJob", "AutoStartTransformation" ]
        },
        {
          "Label": { "default": "Machine Learning" },
          "Parameters": [ "DeploySagemaker", "NotebookInstanceType", "KmsKeyId" ]
        }
      ],
      "ParameterLabels": {
        "S3Bucket": { "default": "Source Bucket" },
        "S3PrefixRaw": { "default": "Source Prefix" },
        "DestinationBucket": { "default": "Destination Bucket" },
        "S3PrefixTransform": { "default": "Destination Prefix" },
        "SyntheticData": { "default": "Synthetic Data" },
        "CronJob": { "default": "Cron Job" },
        "DeploySagemaker": { "default": "Deploy Sagemaker" },
        "AutoStartTransformation": { "default": "Start Transformation" },
        "NotebookInstanceType": { "default": "Notebook InstanceType" },
        "KmsKeyId": { "default": "KMS Key Id" }
      }
    }
  },
  "Mappings": {
    "Send": {
      "AnonymousUsage": { "Data": "Yes" }
    },
    "SB": {
      "Src": {
        "Bucket": "%%BUCKET_NAME%%",
        "SolutionName": "%%SOLUTION_NAME%%"
      }
    },
    "Function": {
      "SolutionHelper": {
        "S3Bucket": "%%BUCKET_NAME%%",
        "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/local-solution-helper.zip",
        "Description": "CloudFormation custom resource function invoked for sending anonymous data and creating unique ID."
      },
      "CustomResource": {
        "S3Bucket": "%%BUCKET_NAME%%",
        "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/ml-custom-resource.zip",
        "Description": "machine-learning-for-all: CloudFormation custom resource function invoked during CloudFormation create, update, and delete stack operations."
      }
    },
    "ETL": {
      "Glue": { "DPU": "10" }
    }
  },
  "Conditions": {
    "MockData": {
      "Fn::Equals": [ { "Ref": "SyntheticData" }, "Yes" ]
    },
    "AutoStart": {
      "Fn::Equals": [ { "Ref": "AutoStartTransformation" }, "Yes" ]
    },
    "CreateSagemaker": {
      "Fn::Equals": [ { "Ref": "DeploySagemaker" }, "Yes" ]
    },
    "HasKmsKeyId": {
      "Fn::Not": [
        { "Fn::Equals": [ "", { "Ref": "KmsKeyId" } ] }
      ]
    }
  },
  "Resources": {
    "LambdaExecutionRole": {
      "Type": "AWS::IAM::Role",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2012-10-17",
          "Statement": [
            {
              "Effect": "Allow",
              "Principal": { "Service": [ "lambda.amazonaws.com" ] },
              "Action": [ "sts:AssumeRole" ]
            }
          ]
        },
        "Path": "/",
        "Policies": [
          {
            "PolicyName": "sagemaker",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [ "sagemaker:Create*", "sagemaker:Delete*" ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [
                          "arn:aws:sagemaker:",
                          { "Ref": "AWS::Region" },
                          ":",
                          { "Ref": "AWS::AccountId" },
                          ":*"
                        ]
                      ]
                    }
                  ]
                },
                {
                  "Effect": "Allow",
                  "Action": [ "iam:PassRole" ],
                  "Resource": {
                    "Fn::Join": [
                      "",
                      [
                        "arn:aws:iam::",
                        { "Ref": "AWS::AccountId" },
                        ":role/*"
                      ]
                    ]
                  },
                  "Condition": {
                    "StringEquals": {
                      "iam:PassedToService": "sagemaker.amazonaws.com"
                    }
                  }
                }
              ]
            }
          },
          {
            "PolicyName": "glue",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [
                    "glue:DeleteJob",
                    "glue:StartJobRun",
                    "glue:PutDataCatalogEncryptionSettings"
                  ],
                  "Resource": "*"
                },
                {
                  "Effect": "Allow",
                  "Action": [ "glue:CreateTable" ],
                  "Resource": {
                    "Fn::Join": [
                      "",
                      [
                        "arn:aws:glue:",
                        { "Ref": "AWS::Region" },
                        ":",
                        { "Ref": "AWS::AccountId" },
                        ":*"
                      ]
                    ]
                  }
                }
              ]
            }
          },
          {
            "PolicyName": "logs",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [
                    "logs:CreateLogGroup",
                    "logs:CreateLogStream",
                    "logs:PutLogEvents"
                  ],
                  "Resource": {
                    "Fn::Join": [
                      "",
                      [
                        "arn:aws:logs:",
                        { "Ref": "AWS::Region" },
                        ":",
                        { "Ref": "AWS::AccountId" },
                        ":log-group:/aws/lambda/*"
                      ]
                    ]
                  }
                }
              ]
            }
          },
          {
            "PolicyName": "s3",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [ "s3:GetObject" ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [
                          "arn:aws:s3:::",
                          { "Fn::FindInMap": [ "SB", "Src", "Bucket" ] },
                          "-",
                          { "Ref": "AWS::Region" },
                          "/",
                          { "Fn::FindInMap": [ "SB", "Src", "SolutionName" ] },
                          "/*"
                        ]
                      ]
                    },
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ]
                      ]
                    }
                  ]
                },
                {
                  "Effect": "Allow",
                  "Action": [
                    "s3:PutObject",
                    "s3:CreateBucket",
                    "s3:PutBucketPolicy",
                    "s3:PutBucketPublicAccessBlock",
                    "s3:PutEncryptionConfiguration"
                  ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ]
                      ]
                    },
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "S3Bucket" } ]
                      ]
                    },
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "DestinationBucket" } ]
                      ]
                    }
                  ]
                },
                {
                  "Effect": "Allow",
                  "Action": [ "s3:PutAccountPublicAccessBlock" ],
                  "Resource": "*"
                }
              ]
            }
          }
        ]
      },
      "Metadata": {
        "cfn_nag": {
          "rules_to_suppress": [
            {
              "id": "F3",
              "reason": "This is required for Glue access to run all AWS Glue API operations."
            },
            {
              "id": "W11",
              "reason": "This is required for Glue access to run all AWS Glue API operations. https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html"
            }
          ]
        }
      }
    },
    "JobRole": {
      "Type": "AWS::IAM::Role",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2012-10-17",
          "Statement": [
            {
              "Effect": "Allow",
              "Principal": { "Service": [ "glue.amazonaws.com" ] },
              "Action": [ "sts:AssumeRole" ]
            }
          ]
        },
        "Path": "/",
        "Policies": [
          {
            "PolicyName": "gluejob",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [ "glue:*" ],
                  "Resource": "*"
                }
              ]
            }
          },
          {
            "PolicyName": "s3",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [
                    "s3:GetObject",
                    "s3:PutObject",
                    "s3:DeleteObject",
                    "s3:GetBucketLocation",
                    "s3:CreateBucket"
                  ],
                  "Resource": [
                    "arn:aws:s3:::aws-glue-*/*",
                    "arn:aws:s3:::*/*aws-glue-*/*",
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ]
                      ]
                    },
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "DestinationBucket" }, "/*" ]
                      ]
                    }
                  ]
                },
                {
                  "Effect": "Allow",
                  "Action": [ "s3:ListBucket" ],
                  "Resource": "*"
                }
              ]
            }
          },
          {
            "PolicyName": "logs",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [
                    "logs:CreateLogGroup",
                    "logs:CreateLogStream",
                    "logs:PutLogEvents"
                  ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [
                          "arn:aws:logs:",
                          { "Ref": "AWS::Region" },
                          ":",
                          { "Ref": "AWS::AccountId" },
                          ":log-group:/aws-glue/*"
                        ]
                      ]
                    }
                  ]
                }
              ]
            }
          }
        ]
      },
      "Metadata": {
        "cfn_nag": {
          "rules_to_suppress": [
            {
              "id": "F3",
              "reason": "This is required for Glue access to run all AWS Glue API operations."
            },
            {
              "id": "W11",
              "reason": "This is required for Glue access to run all AWS Glue API operations. https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html"
            }
          ]
        }
      }
    },
    "SagemakerExecutionRole": {
      "Type": "AWS::IAM::Role",
      "Condition": "CreateSagemaker",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2012-10-17",
          "Statement": [
            {
              "Effect": "Allow",
              "Principal": { "Service": [ "sagemaker.amazonaws.com" ] },
              "Action": [ "sts:AssumeRole" ]
            }
          ]
        },
        "ManagedPolicyArns": [
          "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess"
        ],
        "Path": "/",
        "Policies": [
          {
            "PolicyName": "AmazonSagemakerExecution",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [ "iam:GetRole", "iam:PassRole" ],
                  "Resource": {
                    "Fn::Join": [
                      "",
                      [
                        "arn:aws:iam::",
                        { "Ref": "AWS::AccountId" },
                        ":role/",
                        { "Ref": "AWS::StackName" },
                        "-SagemakerExecutionRole-*"
                      ]
                    ]
                  }
                },
                {
                  "Effect": "Allow",
                  "Action": [
                    "s3:GetObject",
                    "s3:PutObject",
                    "s3:DeleteObject"
                  ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ]
                      ]
                    },
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "DestinationBucket" }, "/*" ]
                      ]
                    }
                  ]
                },
                {
                  "Effect": "Allow",
                  "Action": [ "s3:ListBucket" ],
                  "Resource": [
                    {
                      "Fn::Join": [
                        "",
                        [ "arn:aws:s3:::", { "Ref": "DestinationBucket" } ]
                      ]
                    }
                  ]
                }
              ]
            }
          }
        ]
      },
      "Metadata": {
        "cfn_nag": {
          "rules_to_suppress": [
            {
              "id": "W11",
              "reason": "This is required for Sagemaker access to run -SagemakerExecutionRole-*."
            }
          ]
        }
      }
    },
    "LambdaFunction": {
      "Type": "AWS::Lambda::Function",
      "Properties": {
        "Environment": {
          "Variables": {
            "AWS_ACCOUNT_ID": { "Ref": "AWS::AccountId" },
            "SB_BUCKET": {
              "Fn::Join": [
                "-",
                [
                  { "Fn::FindInMap": [ "SB", "Src", "Bucket" ] },
                  { "Ref": "AWS::Region" }
                ]
              ]
            },
            "S3_BUCKET": { "Ref": "S3Bucket" },
            "S3_DESTINATION_BUCKET": { "Ref": "DestinationBucket" },
            "SB_PREFIX_ARTIFACTS": {
              "Fn::Join": [
                "/",
                [
                  { "Fn::FindInMap": [ "SB", "Src", "SolutionName" ] },
                  "%%VERSION%%",
                  "industry",
                  { "Ref": "Industry" }
                ]
              ]
            },
            "S3_PREFIX_ARTIFACTS": {
              "Fn::Join": [
                "/",
                [
                  { "Fn::FindInMap": [ "SB", "Src", "SolutionName" ] },
                  "%%VERSION%%",
                  { "Ref": "Industry" }
                ]
              ]
            },
            "S3_PREFIX_RAWDATA": { "Ref": "S3PrefixRaw" },
            "LifeCycleConfigName": {
              "Fn::Join": [
                "-",
                [ "Script", { "Ref": "AWS::StackName" } ]
              ]
            },
            "GlueJob": {
              "Fn::Join": [
                "-",
                [
                  "csv-to-parquet",
                  { "Ref": "Industry" },
                  { "Ref": "AWS::StackName" }
                ]
              ]
            },
            "Database": { "Ref": "MlDatabase" },
            "OutputPath": {
              "Fn::Join": [
                "",
                [
                  "s3://",
                  { "Ref": "DestinationBucket" },
                  "/",
                  { "Ref": "S3PrefixTransform" }
                ]
              ]
            }
          }
        },
        "Handler": "lambda_function.handler",
        "Role": { "Fn::GetAtt": [ "LambdaExecutionRole", "Arn" ] },
        "Code": {
          "S3Bucket": {
            "Fn::Join": [
              "",
              [
                { "Fn::FindInMap": [ "Function", "CustomResource", "S3Bucket" ] },
                "-",
                { "Ref": "AWS::Region" }
              ]
            ]
          },
          "S3Key": { "Fn::FindInMap": [ "Function", "CustomResource", "S3Key" ] }
        },
        "Runtime": "python3.7",
        "Timeout": 240
      }
    },
    "MlArtifactsResource": {
      "Type": "Custom::MlArtifactsCustomResource",
      "DependsOn": "LambdaFunction",
      "Version": "1.0",
      "Properties": {
        "ServiceToken": { "Fn::GetAtt": [ "LambdaFunction", "Arn" ] },
        "TransferSyntheticData": { "Fn::If": [ "MockData", "true", "false" ] },
        "CopyArtifacts": "true",
        "S3Bucket": { "Ref": "S3Bucket" }
      }
    },
    "MlDatabase": {
      "Type": "AWS::Glue::Database",
      "Properties": {
        "CatalogId": { "Ref": "AWS::AccountId" },
        "DatabaseInput": {
          "Description": "Database to hold tables for machine learning data"
        }
      }
    },
    "TransformationJob": {
      "Type": "AWS::Glue::Job",
      "DependsOn": "MlArtifactsResource",
      "Properties": {
        "Command": {
          "Name": "glueetl",
          "PythonVersion": "3",
          "ScriptLocation": {
            "Fn::Join": [
              "",
              [
                "s3://",
                { "Ref": "S3Bucket" },
                "/",
                { "Fn::FindInMap": [ "SB", "Src", "SolutionName" ] },
                "/%%VERSION%%/",
                { "Ref": "Industry" },
                "/scripts/glue-script/csv-to-parquet.py"
              ]
            ]
          }
        },
        "GlueVersion": "1.0",
        "ExecutionProperty": { "MaxConcurrentRuns": 3 },
        "AllocatedCapacity": { "Fn::FindInMap": [ "ETL", "Glue", "DPU" ] },
        "MaxRetries": 1,
        "Name": {
          "Fn::Join": [
            "-",
            [
              "csv-to-parquet",
              { "Ref": "Industry" },
              { "Ref": "AWS::StackName" }
            ]
          ]
        },
        "Role": { "Ref": "JobRole" }
      }
    },
    "ScheduledJobTrigger": {
      "Type": "AWS::Glue::Trigger",
      "Properties": {
        "Type": "SCHEDULED",
        "Description": "Scheduled trigger that starts the csv-to-parquet transformation job.",
        "Schedule": { "Ref": "CronJob" },
        "StartOnCreation": true,
        "Actions": [
          {
            "JobName": { "Ref": "TransformationJob" },
            "Arguments": {
              "--job-bookmark-option": "job-bookmark-enable"
            }
          }
        ],
        "Name": {
          "Fn::Join": [
            "-",
            [ "ml-for-all", { "Ref": "AWS::StackName" } ]
          ]
        }
      }
    },
    "MlSagemakerResource": {
      "Type": "Custom::MlSagemakerCustomResource",
      "DependsOn": "MlArtifactsResource",
      "Version": "1.0",
      "Properties": {
        "ServiceToken": { "Fn::GetAtt": [ "LambdaFunction", "Arn" ] },
        "CreateConfig": { "Fn::If": [ "CreateSagemaker", "true", "false" ] }
      }
    },
    "Model": {
      "Type": "AWS::SageMaker::Model",
      "Condition": "CreateSagemaker",
      "DependsOn": "MlArtifactsResource",
      "Properties": {
        "PrimaryContainer": {
          "Image": {
            "Fn::Join": [
              "",
              [
                { "Ref": "AWS::AccountId" },
                ".dkr.ecr.",
                { "Ref": "AWS::Region" },
                ".amazonaws.com/linear-learner:1"
              ]
            ]
          },
          "ModelDataUrl": {
            "Fn::Join": [
              "",
              [
                "s3://",
                { "Ref": "S3Bucket" },
                "/",
                { "Fn::FindInMap": [ "SB", "Src", "SolutionName" ] },
                "/%%VERSION%%/",
                { "Ref": "Industry" },
                "/models/model.tar.gz"
              ]
            ]
          }
        },
        "ExecutionRoleArn": { "Fn::GetAtt": [ "SagemakerExecutionRole", "Arn" ] }
      }
    },
    "NotebookInstance": {
      "Type": "AWS::SageMaker::NotebookInstance",
      "Condition": "CreateSagemaker",
      "DependsOn": "MlSagemakerResource",
      "Properties": {
        "KmsKeyId": {
          "Fn::If": [
            "HasKmsKeyId",
            { "Ref": "KmsKeyId" },
            { "Ref": "AWS::NoValue" }
          ]
        },
        "NotebookInstanceName": {
          "Fn::Join": [
            "-",
            [ "ml-for-all", { "Ref": "AWS::StackName" } ]
          ]
        },
        "InstanceType": { "Ref": "NotebookInstanceType" },
        "RoleArn": { "Fn::GetAtt": [ "SagemakerExecutionRole", "Arn" ] },
        "LifecycleConfigName": {
          "Fn::Join": [
            "-",
            [ "Script", { "Ref": "AWS::StackName" } ]
          ]
        }
      }
    },
    "MlGlueResource": {
      "Type": "Custom::MlGlueCustomResource",
      "DependsOn": [ "TransformationJob" ],
      "Version": "1.0",
      "Properties": {
        "ServiceToken": { "Fn::GetAtt": [ "LambdaFunction", "Arn" ] },
        "InvokeGlueJob": { "Fn::If": [ "AutoStart", "true", "false" ] }
      }
    },
    "SolutionHelperRole": {
      "Type": "AWS::IAM::Role",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2012-10-17",
          "Statement": [
            {
              "Effect": "Allow",
              "Principal": { "Service": "lambda.amazonaws.com" },
              "Action": "sts:AssumeRole"
            }
          ]
        },
        "Path": "/",
        "Policies": [
          {
            "PolicyName": "Custom_Solution_Helper_Permissions",
            "PolicyDocument": {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Action": [
                    "logs:CreateLogGroup",
                    "logs:CreateLogStream",
                    "logs:PutLogEvents"
                  ],
                  "Resource": {
                    "Fn::Join": [
                      "",
                      [
                        "arn:aws:logs:",
                        { "Ref": "AWS::Region" },
                        ":",
                        { "Ref": "AWS::AccountId" },
                        ":log-group:/aws/lambda/*"
                      ]
                    ]
                  }
                }
              ]
            }
          }
        ]
      },
      "Metadata": {
        "cfn_nag": {
          "rules_to_suppress": [
            {
              "id": "W11",
              "reason": "This is required for lambda log-group specification."
            }
          ]
        }
      }
    },
    "SolutionHelper": {
      "Type": "AWS::Lambda::Function",
      "Properties": {
        "Handler": "local_solution_helper/solution-helper.lambda_handler",
        "Role": { "Fn::GetAtt": [ "SolutionHelperRole", "Arn" ] },
        "Description": { "Fn::FindInMap": [ "Function", "SolutionHelper", "Description" ] },
        "Code": {
          "S3Bucket": {
            "Fn::Join": [
              "",
              [
                { "Fn::FindInMap": [ "Function", "SolutionHelper", "S3Bucket" ] },
                "-",
                { "Ref": "AWS::Region" }
              ]
            ]
          },
          "S3Key": { "Fn::FindInMap": [ "Function", "SolutionHelper", "S3Key" ] }
        },
        "Runtime": "python3.7",
        "Timeout": 300
      }
    },
    "CreateUniqueID": {
      "Type": "Custom::CreateUUID",
      "Properties": {
        "ServiceToken": { "Fn::GetAtt": [ "SolutionHelper", "Arn" ] },
        "CreateUniqueID": "true"
      }
    },
    "AnonymousData": {
      "Type": "Custom::AnonymousData",
      "Properties": {
        "ServiceToken": { "Fn::GetAtt": [ "SolutionHelper", "Arn" ] },
        "SendAnonymousData": {
          "Fn::Join": [
            "",
            [
              "{ 'Solution' : 'SO0033',",
              "'UUID' : '",
              { "Fn::GetAtt": [ "CreateUniqueID", "UUID" ] },
              "', ",
              "'Data': {",
              "'SyntheticData': '",
              { "Ref": "SyntheticData" },
              "',",
              "'AutoStartTransformation': '",
              { "Ref": "AutoStartTransformation" },
              "',",
              "'DPU': '",
              { "Fn::FindInMap": [ "ETL", "Glue", "DPU" ] },
              "',",
              "'DeploySagemaker': '",
              { "Ref": "DeploySagemaker" },
              "',",
              "'InstanceType': '",
              { "Ref": "NotebookInstanceType" },
              "',",
              "'Region': '",
              { "Ref": "AWS::Region" },
              "',",
              "'SendAnonymousData': '",
              { "Fn::FindInMap": [ "Send", "AnonymousUsage", "Data" ] },
              "'",
              "}",
              "}"
            ]
          ]
        }
      }
    }
  },
  "Outputs": {
    "Database": {
      "Description": "Machine learning Database",
      "Value": { "Ref": "MlDatabase" }
    },
    "GlueJob": {
      "Description": "Glue Job Name",
      "Value": { "Ref": "TransformationJob" }
    },
    "BasicNotebookInstanceId": {
      "Description": "Reference (Ref) to the SageMaker notebook instance",
      "Condition": "CreateSagemaker",
      "Value": { "Ref": "NotebookInstance" }
    },
    "SagemakerInstance": {
      "Description": "Name of Sagemaker Instance",
      "Condition": "CreateSagemaker",
      "Value": { "Fn::GetAtt": [ "NotebookInstance", "NotebookInstanceName" ] }
    }
  }
}