AWSTemplateFormatVersion: '2010-09-09'
Transform: 'AWS::Serverless-2016-10-31'
Description: 'Corrosion Detection - Machine Learning Training and Model Hosting Step function workflow'

Parameters:
  # Container image used both for the SageMaker training jobs and as the
  # DOCKER_IMAGE environment variable of the deployment Lambdas. The default
  # is the value that was previously hard-coded throughout the template, so
  # existing deployments are unaffected; override it to deploy in a region
  # other than us-east-1 or to pin a specific image tag.
  TrainingImageUri:
    Type: String
    Default: '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'
    Description: 'ECR URI of the XGBoost container image used for training and hosting'

Resources:
  # Step functions state machine
  #
  # Full pipeline: train a model, create the endpoint, poll until the endpoint
  # is InService, then invoke the ImagesForTrainingUpdator Lambda.
  # The execution input must supply every "$.…" path referenced in the
  # training-job Parameters below (HyperParameters, S3 URIs, instance settings,
  # MaxRuntimeInSeconds and TrainingJobName).
  #
  # The Choice state checks for "Failed" first so that an endpoint whose
  # creation failed ends the execution instead of being polled every 60 seconds
  # until the execution times out.
  # NOTE(review): assumes the check handler returns SageMaker's EndpointStatus
  # value unchanged under "EndpointStatus" - confirm against the Lambda code.
  StateMachineModelAndEndpointStateMachine:
    Type: 'AWS::StepFunctions::StateMachine'
    Properties:
      StateMachineName: 'Corrosion-Detection-ModelAndEndpointStateMachine'
      RoleArn: !GetAtt StateMachineExecutionRole.Arn
      DefinitionString: !Sub |
        {
          "Comment": "Machine Learning Training and Model Hosting workflow",
          "StartAt": "StartTrainingJob",
          "Version": "1.0",
          "States": {
            "StartTrainingJob": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync",
              "Parameters": {
                "AlgorithmSpecification": {
                  "TrainingImage": "${TrainingImageUri}",
                  "TrainingInputMode": "File"
                },
                "HyperParameters": {
                  "max_depth.$": "$.HyperParameters.max_depth",
                  "learning_rate.$": "$.HyperParameters.learning_rate",
                  "eta.$": "$.HyperParameters.eta",
                  "colsample_bytree.$": "$.HyperParameters.colsample_bytree",
                  "gamma.$": "$.HyperParameters.gamma",
                  "n_estimators.$": "$.HyperParameters.n_estimators",
                  "min_child_weight.$": "$.HyperParameters.min_child_weight",
                  "num_class.$": "$.HyperParameters.num_class",
                  "subsample.$": "$.HyperParameters.subsample",
                  "num_round.$": "$.HyperParameters.num_round",
                  "objective.$": "$.HyperParameters.objective"
                },
                "InputDataConfig": [
                  {
                    "ChannelName": "train",
                    "CompressionType": "None",
                    "ContentType": "text/csv",
                    "DataSource": {
                      "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri.$": "$.InputTrainingS3Uri"
                      }
                    },
                    "RecordWrapperType": "None"
                  },
                  {
                    "ChannelName": "validation",
                    "CompressionType": "None",
                    "ContentType": "text/csv",
                    "DataSource": {
                      "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri.$": "$.InputValidationS3Uri"
                      }
                    },
                    "RecordWrapperType": "None"
                  }
                ],
                "OutputDataConfig": {
                  "S3OutputPath.$": "$.S3OutputPath"
                },
                "ResourceConfig": {
                  "InstanceCount.$": "$.InstanceCount",
                  "InstanceType.$": "$.InstanceType",
                  "VolumeSizeInGB": 30
                },
                "RoleArn": "${SageMakerExecutionRole.Arn}",
                "StoppingCondition": {
                  "MaxRuntimeInSeconds.$": "$.MaxRuntimeInSeconds"
                },
                "TrainingJobName.$": "$.TrainingJobName"
              },
              "ResultPath": "$.Step1",
              "Next": "CreateEndPoint"
            },
            "CreateEndPoint": {
              "Type": "Task",
              "Resource": "${ModelAndEndpointDeployHandler.Arn}",
              "ResultPath": "$.endpointCreationResult",
              "Next": "CheckEndPointStatus"
            },
            "CheckEndPointStatus": {
              "Type": "Task",
              "Resource": "${ModelAndEndpointCheckDeployHandler.Arn}",
              "ResultPath": "$.endpointStatusResult",
              "Next": "IsEndpointInService"
            },
            "IsEndpointInService": {
              "Type": "Choice",
              "Choices": [
                {
                  "Variable": "$.endpointStatusResult.EndpointStatus",
                  "StringEquals": "Failed",
                  "Next": "EndpointCreationFailed"
                },
                {
                  "Not": {
                    "Variable": "$.endpointStatusResult.EndpointStatus",
                    "StringEquals": "InService"
                  },
                  "Next": "wait_sixty_seconds"
                }
              ],
              "Default": "UpdateImagesForTraining"
            },
            "wait_sixty_seconds": {
              "Type": "Wait",
              "Seconds": 60,
              "Next": "CheckEndPointStatus"
            },
            "EndpointCreationFailed": {
              "Type": "Fail",
              "Error": "EndpointCreationFailed",
              "Cause": "The SageMaker endpoint reported the Failed status"
            },
            "UpdateImagesForTraining": {
              "Type": "Task",
              "Resource": "${ImagesForTrainingUpdator.Arn}",
              "ResultPath": "$.ImagesForTrainingCounter",
              "Next": "Done"
            },
            "Done": {
              "Type": "Pass",
              "End": true
            }
          }
        }

  # Training-only pipeline: run the SageMaker training job, then invoke the
  # ImagesForTrainingUpdator Lambda. Expects the same execution input as the
  # training step of the full pipeline above.
  TrainingJobStateMachine:
    Type: 'AWS::StepFunctions::StateMachine'
    Properties:
      StateMachineName: 'Corrosion-Detection-TrainingJobStateMachine'
      RoleArn: !GetAtt StateMachineExecutionRole.Arn
      DefinitionString: !Sub |
        {
          "Comment": "Sagemaker Machine Learning Model Training workflow",
          "StartAt": "StartTrainingJob",
          "Version": "1.0",
          "States": {
            "StartTrainingJob": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync",
              "Parameters": {
                "AlgorithmSpecification": {
                  "TrainingImage": "${TrainingImageUri}",
                  "TrainingInputMode": "File"
                },
                "HyperParameters": {
                  "max_depth.$": "$.HyperParameters.max_depth",
                  "learning_rate.$": "$.HyperParameters.learning_rate",
                  "eta.$": "$.HyperParameters.eta",
                  "colsample_bytree.$": "$.HyperParameters.colsample_bytree",
                  "gamma.$": "$.HyperParameters.gamma",
                  "n_estimators.$": "$.HyperParameters.n_estimators",
                  "min_child_weight.$": "$.HyperParameters.min_child_weight",
                  "num_class.$": "$.HyperParameters.num_class",
                  "subsample.$": "$.HyperParameters.subsample",
                  "num_round.$": "$.HyperParameters.num_round",
                  "objective.$": "$.HyperParameters.objective"
                },
                "InputDataConfig": [
                  {
                    "ChannelName": "train",
                    "CompressionType": "None",
                    "ContentType": "text/csv",
                    "DataSource": {
                      "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri.$": "$.InputTrainingS3Uri"
                      }
                    },
                    "RecordWrapperType": "None"
                  },
                  {
                    "ChannelName": "validation",
                    "CompressionType": "None",
                    "ContentType": "text/csv",
                    "DataSource": {
                      "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri.$": "$.InputValidationS3Uri"
                      }
                    },
                    "RecordWrapperType": "None"
                  }
                ],
                "OutputDataConfig": {
                  "S3OutputPath.$": "$.S3OutputPath"
                },
                "ResourceConfig": {
                  "InstanceCount.$": "$.InstanceCount",
                  "InstanceType.$": "$.InstanceType",
                  "VolumeSizeInGB": 30
                },
                "RoleArn": "${SageMakerExecutionRole.Arn}",
                "StoppingCondition": {
                  "MaxRuntimeInSeconds.$": "$.MaxRuntimeInSeconds"
                },
                "TrainingJobName.$": "$.TrainingJobName"
              },
              "ResultPath": "$.Step1",
              "Next": "UpdateImagesForTraining"
            },
            "UpdateImagesForTraining": {
              "Type": "Task",
              "Resource": "${ImagesForTrainingUpdator.Arn}",
              "ResultPath": "$.Step2",
              "Next": "Done"
            },
            "Done": {
              "Type": "Pass",
              "End": true
            }
          }
        }

  # Endpoint-only pipeline: invoke the EndpointDeployHandler Lambda, then poll
  # the status Lambda every 60 seconds until the endpoint is InService.
  # As in the full pipeline, a "Failed" status terminates the execution rather
  # than looping forever.
  # NOTE(review): assumes the check handler returns SageMaker's EndpointStatus
  # value unchanged under "EndpointStatus" - confirm against the Lambda code.
  EndpointStateMachine:
    Type: 'AWS::StepFunctions::StateMachine'
    Properties:
      StateMachineName: 'Corrosion-Detection-EndpointStateMachine'
      RoleArn: !GetAtt StateMachineExecutionRole.Arn
      DefinitionString: !Sub |
        {
          "Comment": "Sagemaker Endpoint Creation workflow",
          "StartAt": "CreateEndPoint",
          "Version": "1.0",
          "States": {
            "CreateEndPoint": {
              "Type": "Task",
              "Resource": "${EndpointDeployHandler.Arn}",
              "ResultPath": "$.endpointCreationResult",
              "Next": "CheckEndPointStatus"
            },
            "CheckEndPointStatus": {
              "Type": "Task",
              "Resource": "${ModelAndEndpointCheckDeployHandler.Arn}",
              "ResultPath": "$.endpointStatusResult",
              "Next": "IsEndpointInService"
            },
            "IsEndpointInService": {
              "Type": "Choice",
              "Choices": [
                {
                  "Variable": "$.endpointStatusResult.EndpointStatus",
                  "StringEquals": "Failed",
                  "Next": "EndpointCreationFailed"
                },
                {
                  "Not": {
                    "Variable": "$.endpointStatusResult.EndpointStatus",
                    "StringEquals": "InService"
                  },
                  "Next": "wait_sixty_seconds"
                }
              ],
              "Default": "Done"
            },
            "wait_sixty_seconds": {
              "Type": "Wait",
              "Seconds": 60,
              "Next": "CheckEndPointStatus"
            },
            "EndpointCreationFailed": {
              "Type": "Fail",
              "Error": "EndpointCreationFailed",
              "Cause": "The SageMaker endpoint reported the Failed status"
            },
            "Done": {
              "Type": "Pass",
              "End": true
            }
          }
        }

  # Lambda functions
  #
  # All functions are packaged from the template directory (CodeUri: .) and run
  # under LambdaExecutionRole.
  # NOTE(review): the runtime was raised from python3.6, which Lambda no longer
  # accepts for function creation or update - confirm the handler code runs on
  # the newer interpreter.

  # Invoked by the "CreateEndPoint" step of the full pipeline.
  ModelAndEndpointDeployHandler:
    Type: 'AWS::Serverless::Function'
    Properties:
      Handler: ModelAndEndpointCreator.lambda_handler
      Runtime: python3.12
      CodeUri: .
      Description: 'Endpoint Creator'
      MemorySize: 512
      Timeout: 300
      Environment:
        Variables:
          DOCKER_IMAGE: !Ref TrainingImageUri
          EXECUTION_ROLE_ARN: !GetAtt SageMakerExecutionRole.Arn
      Role: !GetAtt LambdaExecutionRole.Arn

  # Invoked by the "CheckEndPointStatus" step of both endpoint workflows; its
  # result is read by the Choice state at "$.endpointStatusResult.EndpointStatus".
  ModelAndEndpointCheckDeployHandler:
    Type: 'AWS::Serverless::Function'
    Properties:
      Handler: ModelAndEndpointCreator.check_handler
      Runtime: python3.12
      CodeUri: .
      Description: 'Enpoint Checker'
      MemorySize: 512
      Timeout: 300
      Environment:
        Variables:
          DOCKER_IMAGE: !Ref TrainingImageUri
          EXECUTION_ROLE_ARN: !GetAtt SageMakerExecutionRole.Arn
      Role: !GetAtt LambdaExecutionRole.Arn

  # Invoked by the "CreateEndPoint" step of the endpoint-only pipeline.
  EndpointDeployHandler:
    Type: 'AWS::Serverless::Function'
    Properties:
      Handler: ModelDeployer.lambda_handler
      Runtime: python3.12
      CodeUri: .
      Description: 'Endpoint Deployer'
      MemorySize: 512
      Timeout: 300
      Environment:
        Variables:
          DOCKER_IMAGE: !Ref TrainingImageUri
          EXECUTION_ROLE_ARN: !GetAtt SageMakerExecutionRole.Arn
      Role: !GetAtt LambdaExecutionRole.Arn

  # Invoked by the "UpdateImagesForTraining" step of the training workflows.
  ImagesForTrainingUpdator:
    Type: 'AWS::Serverless::Function'
    Properties:
      Handler: TrainingImageUpdator.lambda_handler
      Runtime: python3.12
      CodeUri: .
      Description: 'Images for Training Lambda Handler'
      MemorySize: 512
      Timeout: 300
      Role: !GetAtt LambdaExecutionRole.Arn

  # IAM roles
  #
  # NOTE(review): every inline policy below grants service-wide wildcard
  # actions on all resources (including 'iam:*' for the Lambda role). The
  # grants are left unchanged here because the handlers' actual needs are not
  # visible from this template; they should be narrowed once those are known.

  # Role assumed by all four Lambda functions.
  LambdaExecutionRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action: 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
      Policies:
        - PolicyName: corrosion-app-lambda-exec-policy
          PolicyDocument:
            # Explicit version so the policy is evaluated with the current
            # policy language rather than the legacy 2008 default.
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - 'sagemaker:*'
                  - 'cloudwatch:*'
                  - 'dynamodb:*'
                  - 'iam:*'
                  - 'ssm:*'
                Resource: '*'

  # Role passed to SageMaker for training jobs (RoleArn in the state machines)
  # and exposed to the Lambdas as EXECUTION_ROLE_ARN.
  SageMakerExecutionRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: sagemaker.amazonaws.com
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: corrosion-app-SageMakerExecutionRolePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - 's3:*'
                  - 'sagemaker:*'
                  - 'cloudwatch:*'
                  - 'logs:*'
                  - 'ecr:*'
                Resource: '*'

  # Role assumed by Step Functions to run the three state machines.
  StateMachineExecutionRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: states.amazonaws.com
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: corrosion-app-statemachine-SageMakerExecutionRolePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - 'lambda:*'
                  - 'sagemaker:*'
                  - 'iam:PassRole'
                  - 'events:*'
                Resource: '*'

# Ref on a state machine returns its ARN.
Outputs:
  TrainingAndEndpointStateMachineArn:
    Value: !Ref StateMachineModelAndEndpointStateMachine
  EndpointStateMachineArn:
    Value: !Ref EndpointStateMachine
  TrainingStateMachineArn:
    Value: !Ref TrainingJobStateMachine