AWSTemplateFormatVersion: "2010-09-09"
Transform: AWS::Serverless-2016-10-31
Description: >
  training-pipeline
  Training pipeline and redeploy the new model

Parameters:
  # S3 bucket name. It is passed to the Start*Function Lambdas as
  # TRAINING_BUCKET_NAME and used to build the S3 ARNs in the SageMaker
  # execution policy and in the Cognito authenticated role.
  TrainingBucketName:
    Type: String
    Description: Name of Artifact Bucket

  # Env vars for the training step.
  # Exposed to the job-starting Lambdas as ECR_TRAINING_IMAGE_REPO_NAME.
  ECRTrainingImageRepoName:
    Type: String
    Default: "dnn"
  # Exposed to the job-starting Lambdas as TRAINING_INSTANCE_SIZE.
  TrainingInstanceSize:
    Type: String
    Default: "ml.c4.2xlarge"
  # Exposed to StartTrainingFunction as ENDPOINT_INSTANCE_SIZE.
  EndpointInstanceSize:
    Type: String
    Default: "ml.t2.medium"
  # Exposed to DeleteOldestEndpointFunction as MAX_NUMBER_END_POINT.
  MaxNumberEndpoint:
    Type: Number
    Default: 5

  # For the Cognito user pool: e-mail attribute of the AdminUser resource.
  EmailAddress:
    Type: String
    Description: Email address of the admin

# More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst
Globals:
  Function:
    MemorySize: 1792  # One full vCPU
    Timeout: 30  # Default timeout for API Gateway. Make it consistent with Lambda
  Api:
    Auth:
      Authorizers:
        # Referenced by name from the Auth section of every Api event.
        MyCognitoAuth:
          UserPoolArn: !GetAtt UserPool.Arn  # Can also accept an array
    Cors:
      # The inner single quotes are part of each value and must be kept.
      AllowOrigin: "'*'"
      AllowHeaders: "'Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token'"
      AllowMethods: "'DELETE,GET,HEAD,OPTIONS,PATCH,POST,PUT'"
    MethodSettings:
      - ResourcePath: "/*"
        HttpMethod: "*"
        # TODO: Tune down logging level when you go live
        LoggingLevel: INFO
        DataTraceEnabled: true
        MetricsEnabled: true

Resources:
  ###########################################################################
  ### Add a role for API Gateway to push logs to CloudWatch
  ###########################################################################
  # This allows you to debug why API Gateway gives an error response without triggering Lambda
  # (or it triggers a Lambda function, but the returned response is incorrect)
  #
  # Note about deletion: (13 Feb 2020)
  # When you delete these resources, the ApiGateway::Account will still point to the same role
  # (CFN doesn't remove it as you would expect). The role, however, will be deleted.
  # This will cause an error of role not found. There is no effective way to avoid this as it's
  # CFN behavior.
###########################################################################
  # Role that API Gateway assumes to write its logs to CloudWatch.
  AmazonAPIGatewayPushToCloudWatchLogsRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - apigateway.amazonaws.com
            Action: 'sts:AssumeRole'
      Path: /
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs

  # Note that this setting is for all APIs in the deployed region
  # If you have manually assigned a role to API GW, it would be overridden.
  # Though it shouldn't break anything.
  ApiGatewayAccount:
    Type: 'AWS::ApiGateway::Account'
    Properties:
      CloudWatchRoleArn: !GetAtt AmazonAPIGatewayPushToCloudWatchLogsRole.Arn

  ###########################################################################
  ### Step Functions for training and HPO ####
  ###########################################################################
  TrainingModelStateMachine:
    # More info about State Machine Resource: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-resource-statemachine.html
    Type: AWS::Serverless::StateMachine
    Properties:
      DefinitionUri: statemachine/model-training.asl.json
      # Values substituted into the ASL definition file.
      DefinitionSubstitutions:
        DeleteOldestEndpointFunctionArn: !GetAtt DeleteOldestEndpointFunction.Arn
        CheckDeploymentStatusFunctionArn: !GetAtt CheckDeploymentStatusFunction.Arn
        ModelTable: !Ref ModelTable
        HPOTable: !Ref HPOTable
      # Find out more about SAM policy templates: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-policy-templates.html
      Policies:
        - LambdaInvokePolicy:
            FunctionName: !Ref DeleteOldestEndpointFunction
        - LambdaInvokePolicy:
            FunctionName: !Ref CheckDeploymentStatusFunction
        - CloudWatchPutMetricPolicy: {}
        # The version is quoted so YAML parsers do not read it as a date.
        - Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Resource: "*"
              Action:
                - sagemaker:CreateTransformJob
                - sagemaker:DescribeTransformJob
                - sagemaker:StopTransformJob
                - sagemaker:CreateTrainingJob
                - sagemaker:DescribeTrainingJob
                - sagemaker:StopTrainingJob
                - sagemaker:CreateHyperParameterTuningJob
                - sagemaker:DescribeHyperParameterTuningJob
                - sagemaker:StopHyperParameterTuningJob
                - sagemaker:CreateModel
                - sagemaker:CreateEndpointConfig
                - sagemaker:CreateEndpoint
                - sagemaker:DeleteEndpointConfig
                - sagemaker:DeleteEndpoint
                - sagemaker:UpdateEndpoint
                - sagemaker:AddTags
                - sagemaker:DeleteTags
                - sagemaker:ListTags
                - sqs:SendMessage
                - sns:Publish
                - ecs:RunTask
                - ecs:StopTask
                - ecs:DescribeTasks
                # - batch:SubmitJob
                # - batch:DescribeJobs
                # - batch:TerminateJob
                # - glue:StartJobRun
                # - glue:GetJobRun
                # - glue:GetJobRuns
                # - glue:BatchStopJobRun
            # Roles may only be passed to SageMaker.
            - Effect: Allow
              Action:
                - iam:PassRole
              Resource: "*"
              Condition:
                StringEquals:
                  "iam:PassedToService": sagemaker.amazonaws.com
            - Effect: Allow
              Action:
                - events:PutTargets
                - events:PutRule
                - events:DescribeRule
              Resource:
                - arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule
                - arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule
                - arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule
                - arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule
                - arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule
            # Item-level access to both tables and their indexes.
            - Effect: Allow
              Action:
                - dynamodb:GetItem
                - dynamodb:PutItem
                - dynamodb:UpdateItem
                - dynamodb:DeleteItem
              Resource:
                - !GetAtt ModelTable.Arn
                - !Sub '${ModelTable.Arn}/index/*'
                - !GetAtt HPOTable.Arn
                - !Sub '${HPOTable.Arn}/index/*'

  # Role assumed by SageMaker; its ARN is handed to the job-starting Lambdas
  # as SAGEMAKER_EXECUTION_ROLE_ARN.
  SageMakerExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub "${AWS::StackName}-SageMakerExecutionRole"
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - sagemaker.amazonaws.com
            Action:
              - sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
      Path: /

  # Extra S3 permissions on the training bucket, attached to the role above.
  SageMakerExecutionPolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyName: !Sub "${AWS::StackName}-SageMakerExecutionPolicy"
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - s3:PutObject
              - s3:GetBucketPolicy
              - s3:GetObject
              - s3:ListBucket
            Resource:
              - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/*']]
              - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName]]
      Roles:
        - !Ref SageMakerExecutionRole

  ##################################################################
  ########################### Model Table #############################
  # Keep related status
  # - created
  # - status [TRAINING_MODEL, CREATING_MODEL, CONFIGURING_ENDPOINT, DELETING_OLD_ENDPOINT, DEPLOYING_ENDPOINT, READY, ERROR, ENDPOINT_DELETED]
  # - modelOutput (e.g. "s3://bucket/prefix/model.tar.gz")
  # - modelArn
  # - endpointConfigArn
  # - endpointArn
  # - error
  # - errorCause (JSON string)
  ##################################################################
  ModelTable:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: "trainingId"
          AttributeType: "S"
        - AttributeName: "created"
          AttributeType: "S"
        - AttributeName: "endpointArn"
          AttributeType: "S"
      KeySchema:
        - AttributeName: "trainingId"
          KeyType: "HASH"
        - AttributeName: "created"
          KeyType: "RANGE"
      # Secondary index keyed by endpoint ARN; projects keys only.
      GlobalSecondaryIndexes:
        - IndexName: "endpointArn"
          KeySchema:
            - AttributeName: "endpointArn"
              KeyType: "HASH"
          Projection:
            ProjectionType: "KEYS_ONLY"
      BillingMode: "PAY_PER_REQUEST"  # Use On-demand mode
      SSESpecification:
        SSEEnabled: true

  ##################################################################
  ########################### HPO Table #############################
  # Keep related status
  # - created
  # - status [RUNNING_HPO, HPO_COMPLETED, ERROR]
  # - error
  # - errorCause (JSON string)
  ##################################################################
  HPOTable:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: "trainingId"
          AttributeType: "S"
        - AttributeName: "created"
          AttributeType: "S"
      KeySchema:
        - AttributeName: "trainingId"
          KeyType: "HASH"
        - AttributeName: "created"
          KeyType: "RANGE"
      BillingMode: "PAY_PER_REQUEST"  # Use On-demand mode
      SSESpecification:
        SSEEnabled: true

  ##################################################################
  ########################### API Lambda Functions #################
  ##################################################################
  # NOTE(review): the layer and every function in this template pin
  # python3.7, which AWS Lambda has deprecated - confirm against the Lambda
  # runtimes page and plan an upgrade of all of them together.
  APIDependenciesLayer:
    Type: AWS::Serverless::LayerVersion
    Properties:
      LayerName: APIDependencies
      Description: Dependencies for API
      ContentUri: dependencies/api  # TODO: Put requirement.txt in this folder
      CompatibleRuntimes:
        - python3.7
    Metadata:
      BuildMethod: python3.7  # This one causes SAM to build the layer

  # POST /infer - allowed to invoke SageMaker endpoints named "model-*".
  InferFunction:
    # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/api/
      Handler: infer.post
      Runtime: python3.7
      Policies:
        - Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Action:
                - sagemaker:InvokeEndpoint
              Resource:
                - !Sub "arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:endpoint/model-*"
      Events:
        Infer:
          # More info about API Event Source: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#api
          Type: Api
          Properties:
            Path: /infer
            Method: post
            Auth:
              Authorizer: MyCognitoAuth
      Layers:
        - !Ref APIDependenciesLayer

  # POST /hpo - same handler as StartTrainingFunction, run with MODE=HPO.
  StartHPOFunction:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/api/
      Handler: start_job.post
      Runtime: python3.7
      Environment:
        Variables:
          MODE: "HPO"
          TRAINING_STATE_MACHINE_ARN: !Ref TrainingModelStateMachine
          ACCOUNT_ID: !Ref AWS::AccountId
          REGION: !Ref AWS::Region
          ECR_TRAINING_IMAGE_REPO_NAME: !Ref ECRTrainingImageRepoName
          TRAINING_BUCKET_NAME: !Ref TrainingBucketName
          TRAINING_INSTANCE_SIZE: !Ref TrainingInstanceSize
          SAGEMAKER_EXECUTION_ROLE_ARN: !GetAtt SageMakerExecutionRole.Arn
      Policies:
        - AWSLambdaExecute
        # NOTE(review): this function runs in HPO mode but is granted CRUD on
        # ModelTable, and no table name is passed in its environment - confirm
        # whether HPOTable was intended or the grant is needed at all.
        - DynamoDBCrudPolicy:
            TableName: !Ref ModelTable
        - Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Action:
                - states:StartExecution
              Resource: !Ref TrainingModelStateMachine
      Events:
        PostHPO:
          Type: Api
          Properties:
            Path: /hpo
            Method: post
            Auth:
              Authorizer: MyCognitoAuth
      Layers:
        - !Ref APIDependenciesLayer

  # POST /model - starts the state machine with MODE=MODEL.
  StartTrainingFunction:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/api/
      Handler: start_job.post
      Runtime: python3.7
      Environment:
        Variables:
          MODE: "MODEL"
          TRAINING_STATE_MACHINE_ARN: !Ref TrainingModelStateMachine
          ACCOUNT_ID: !Ref AWS::AccountId
          REGION: !Ref AWS::Region
          ECR_TRAINING_IMAGE_REPO_NAME: !Ref ECRTrainingImageRepoName
          TRAINING_BUCKET_NAME: !Ref TrainingBucketName
          TRAINING_INSTANCE_SIZE: !Ref TrainingInstanceSize
          ENDPOINT_INSTANCE_SIZE: !Ref EndpointInstanceSize
          SAGEMAKER_EXECUTION_ROLE_ARN: !GetAtt SageMakerExecutionRole.Arn
      Policies:
        - AWSLambdaExecute
        - DynamoDBCrudPolicy:
            TableName: !Ref ModelTable
        - Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Action:
                - states:StartExecution
              Resource: !Ref TrainingModelStateMachine
      Events:
        PostModel:
          Type: Api
          Properties:
            Path: /model
            Method: post
            Auth:
              Authorizer: MyCognitoAuth
      Layers:
        - !Ref APIDependenciesLayer

  # GET /model - read-only access to ModelTable.
  GetModelsFunction:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/api/
      Handler: model.get
      Runtime: python3.7
      Environment:
        Variables:
          MODEL_TABLE: !Ref ModelTable
      Policies:
        - DynamoDBReadPolicy:
            TableName: !Ref ModelTable
      Events:
        GetModels:
          Type: Api
          Properties:
            Path: /model
            Method: get
            Auth:
              Authorizer: MyCognitoAuth
      Layers:
        - !Ref APIDependenciesLayer

  # GET /hpo - read-only access to HPOTable.
  GetHPOsFunction:
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/api/
      Handler: hpo.get
      Runtime: python3.7
      Environment:
        Variables:
          HPO_TABLE: !Ref HPOTable
      Policies:
        - DynamoDBReadPolicy:
            TableName: !Ref HPOTable
      Events:
        GetHPOs:
          Type: Api
          Properties:
            Path: /hpo
            Method: get
            Auth:
              Authorizer: MyCognitoAuth
      Layers:
        - !Ref APIDependenciesLayer

  ##################################################################
  ########################### Step Function Lambda Functions #######
  ##################################################################
  DeleteOldestEndpointFunction:
    # More info about Function Resource: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-resource-function.html
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/step_functions/
      Handler: delete_oldest_endpoint.lambda_handler
      Runtime: python3.7
      Environment:
        Variables:
          MODEL_TABLE: !Ref ModelTable
          MAX_NUMBER_END_POINT: !Ref MaxNumberEndpoint
      Policies:
        - Version: "2012-10-17"  # Inline Policy
          Statement:
            - Effect: Allow
              Action:
                - sagemaker:ListEndpoints
                - sagemaker:DeleteEndpoint
              Resource: "*"
            - Effect: Allow
              Action:
                - dynamodb:GetItem
                - dynamodb:PutItem
                - dynamodb:UpdateItem
                - dynamodb:DeleteItem
                - dynamodb:Query
              Resource:
                - !GetAtt ModelTable.Arn
                - !Sub '${ModelTable.Arn}/index/*'

  CheckDeploymentStatusFunction:
    # More info about Function Resource: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-resource-function.html
    Type: AWS::Serverless::Function
    Properties:
      CodeUri: functions/step_functions/
      Handler: check_deployment_status.lambda_handler
      Runtime: python3.7
      Policies:
        - Version: "2012-10-17"  # Inline Policy
          Statement:
            - Effect: Allow
              Action:
                - sagemaker:ListEndpoints
                - sagemaker:DescribeEndpoint
              Resource: "*"

  ###############################################
  ######### Cognito Pool #####################
  ##############################################
  UserPool:
    Type: AWS::Cognito::UserPool
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: F78
            reason: "We dont use MFA within UserPool. Force CFN-NAG to prevent marking this as Failure"
    Properties:
      UserPoolName: !Ref 'AWS::StackName'
      AdminCreateUserConfig:
        # Only an administrator can create users; no self sign-up.
        AllowAdminCreateUserOnly: true
        InviteMessageTemplate:
          EmailMessage: !Join
            - ' '
            - - 'You are invited to join'
              - !Ref 'AWS::StackName'
              - 'your username is {username} and temporary password is: {####}'
          EmailSubject: !Join
            - ' '
            - - 'Your'
              - !Ref 'AWS::StackName'
              - 'account'
        UnusedAccountValidityDays: 7
      AliasAttributes:
        - 'email'
      AutoVerifiedAttributes:
        - 'email'
      # Quoted so the value stays the string 'OFF' and is not read as a boolean.
      MfaConfiguration: 'OFF'
      Policies:
        PasswordPolicy:
          MinimumLength: 8
          RequireLowercase: true
          RequireNumbers: true
          RequireSymbols: false
          RequireUppercase: true

  # Initial admin account; the invitation goes to the EmailAddress parameter.
  AdminUser:
    Type: "AWS::Cognito::UserPoolUser"
    Properties:
      DesiredDeliveryMediums:
        - EMAIL
      UserAttributes:
        - Name: email
          Value: !Ref EmailAddress
      Username: Admin
      UserPoolId: !Ref UserPool

  UserPoolClient:
    Type: "AWS::Cognito::UserPoolClient"
    Properties:
      ClientName: web-client
      GenerateSecret: false
      ReadAttributes:
        - email
      RefreshTokenValidity: 1
      UserPoolId: !Ref UserPool

  IdentityPool:
    Type: "AWS::Cognito::IdentityPool"
    Properties:
      AllowUnauthenticatedIdentities: false
      CognitoIdentityProviders:
        - ClientId: !Ref UserPoolClient
          ProviderName: !Sub 'cognito-idp.${AWS::Region}.amazonaws.com/${UserPool}'

  # Role for identities that signed in through the user pool.
  IdpAuthenticatedRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub "${AWS::StackName}-idp-au-role"
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Federated:
                - cognito-identity.amazonaws.com
            Action:
              - sts:AssumeRoleWithWebIdentity
            Condition:
              StringEquals:
                "cognito-identity.amazonaws.com:aud": !Ref IdentityPool
              "ForAnyValue:StringLike":
                "cognito-identity.amazonaws.com:amr": "authenticated"
      Path: /
      Policies:
        - PolicyName: PipelinePolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # NOTE(review): this grants every authenticated user
              # identity-pool management actions (SetIdentityPoolRoles,
              # UpdateIdentityPool, CreateIdentityPool) - confirm this is
              # intended.
              - Action:
                  - cognito-identity:DescribeIdentityPool
                  - cognito-identity:LookupDeveloperIdentity
                  - cognito-identity:ListIdentityPools
                  - cognito-identity:ListIdentities
                  - cognito-identity:ListTagsForResource
                  - cognito-identity:GetOpenIdTokenForDeveloperIdentity
                  - cognito-identity:GetOpenIdToken
                  - cognito-identity:GetIdentityPoolRoles
                  - cognito-identity:DescribeIdentity
                  - cognito-identity:GetCredentialsForIdentity
                  - cognito-identity:GetId
                  - cognito-identity:SetIdentityPoolRoles
                  - cognito-identity:CreateIdentityPool
                  - cognito-identity:UpdateIdentityPool
                Effect: Allow
                Resource:
                  - !Join ['', ['arn:aws:cognito-identity:*:*:identitypool/', !Ref IdentityPool]]
              # ${cognito-identity.amazonaws.com:sub} is an IAM policy
              # variable; it is deliberately outside any !Sub so that
              # CloudFormation leaves it untouched.
              - Action:
                  - s3:GetObject
                  - s3:PutObject
                  - s3:DeleteObject
                Effect: Allow
                Resource:
                  # See https://aws.amazon.com/premiumsupport/knowledge-center/unable-validate-circular-dependency-cloudformation/ for details
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/public/*']]
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/protected/${cognito-identity.amazonaws.com:sub}/*']]
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/private/${cognito-identity.amazonaws.com:sub}/*']]
              - Action:
                  - s3:PutObject
                Effect: Allow
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/uploads/*']]
              - Action:
                  - s3:GetObject
                Effect: Allow
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName, '/protected/*']]
              - Action:
                  - s3:ListBucket
                Effect: Allow
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref TrainingBucketName]]
                Condition:
                  StringLike:
                    "s3:prefix":
                      - "public/"
                      - "public/*"
                      - "protected/"
                      - "protected/*"
                      - "private/${cognito-identity.amazonaws.com:sub}/"
                      - "private/${cognito-identity.amazonaws.com:sub}/*"

  # Role for unauthenticated identities; its only statement is an explicit Deny.
  IdpUnauthenticatedRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub "${AWS::StackName}-idp-un-role"
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Federated:
                - cognito-identity.amazonaws.com
            Action:
              - sts:AssumeRoleWithWebIdentity
            Condition:
              StringEquals:
                "cognito-identity.amazonaws.com:aud": !Ref IdentityPool
              "ForAnyValue:StringLike":
                "cognito-identity.amazonaws.com:amr": "unauthenticated"
      Path: /
      Policies:
        - PolicyName: PipelinePolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Action:
                  - s3:PutObject
                Effect: Deny
                Resource:
                  - "*"

  IdentityPoolRoleAttachment:
    Type: "AWS::Cognito::IdentityPoolRoleAttachment"
    Properties:
      IdentityPoolId: !Ref IdentityPool
      Roles:
        authenticated: !GetAtt IdpAuthenticatedRole.Arn
        unauthenticated: !GetAtt IdpUnauthenticatedRole.Arn

Outputs:
  # TrainingModelStateMachineRole and ServerlessRestApi are not declared in
  # this template; SAM generates them implicitly.
  # Find out more about other implicit resources you can reference within SAM
  # https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-specification-generated-resources.html
  TrainingModelStateMachineArn:
    Description: "Training Model State machine ARN"
    Value: !Ref TrainingModelStateMachine
  TrainingStateMachineRoleArn:
    Description: "IAM Role created for the Training Model State machine based on the specified SAM Policy Templates"
    Value: !GetAtt TrainingModelStateMachineRole.Arn
  EndpointURL:
    Description: "API Gateway endpoint URL for Prod stage of the training pipeline API"
    Value: !Sub "https://${ServerlessRestApi}.execute-api.${AWS::Region}.amazonaws.com/Prod/"
  UserPoolClient:
    Value: !Ref UserPoolClient
  IdentityPool:
    Value: !Ref IdentityPool
  UserPool:
    Value: !Ref UserPool
  Region:
    Value: !Ref AWS::Region
  UserPoolArn:
    Value: !GetAtt UserPool.Arn
  TrainingBucketName:
    Value: !Ref TrainingBucketName