---
# SAM template that deploys the production inference stack for one model:
#   * a SageMaker Model, EndpointConfig (with data capture) and Endpoint
#   * an API Gateway-backed Lambda that invokes the endpoint, rolled out via a
#     CodeDeploy canary with pre/post traffic hook functions
#   * a Model Monitor schedule plus CloudWatch alarms
#   * Application Auto Scaling for the endpoint variant
#
# ARNs and service hostnames are built from the AWS::Partition and
# AWS::URLSuffix pseudo parameters so that the template resolves correctly in
# every region listed in ModelAnalyzerMap (including the China regions).
Transform: AWS::Serverless-2016-10-31
Description: >-
  Deploy the production Amazon SageMaker Endpoint with Autoscaling,
  Model Monitoring Schedule and API Gateway Lambda.

Parameters:
  ImageRepoUri:
    Type: String
    Description: Uri of the docker (ECR) model image
  ModelName:
    Type: String
    Description: Name of the model
  TrainJobId:
    Type: String
    Description: Id of the Codepipeline + SagemakerJobs
  DeployRoleArn:
    Type: String
    Description: The role for executing the deployment
  ModelVariant:
    Type: String
    Description: Name of the model variant
  ScheduleMetricName:
    Type: String
    Description: The metric to alarm on for schedule
  ScheduleMetricThreshold:
    Type: Number
    Description: The metric alarm threshold
  KmsKeyId:
    Description: >-
      AWS KMS key ID used to encrypt data at rest on the ML storage volume
      attached to endpoint config and S3 data capture.
    Type: String
  NotificationArn:
    Description: The arn for notification topic
    Type: String

Mappings:
  # Latest Model Monitor mapping: https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/image_uri_config/model-monitor.json
  # Each entry must use the registry host of its own region; the China regions
  # use the amazonaws.com.cn suffix.
  ModelAnalyzerMap:
    "af-south-1":
      "ImageUri": "875698925577.dkr.ecr.af-south-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-east-1":
      "ImageUri": "001633400207.dkr.ecr.ap-east-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-northeast-1":
      "ImageUri": "574779866223.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-northeast-2":
      "ImageUri": "709848358524.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-south-1":
      "ImageUri": "126357580389.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-southeast-1":
      "ImageUri": "245545462676.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ap-southeast-2":
      "ImageUri": "563025443158.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "ca-central-1":
      # Fixed: previously pointed at the eu-central-1 registry host.
      "ImageUri": "536280801234.dkr.ecr.ca-central-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "cn-north-1":
      "ImageUri": "453000072557.dkr.ecr.cn-north-1.amazonaws.com.cn/sagemaker-model-monitor-analyzer:latest"
    "cn-northwest-1":
      "ImageUri": "453252182341.dkr.ecr.cn-northwest-1.amazonaws.com.cn/sagemaker-model-monitor-analyzer:latest"
    "eu-central-1":
      "ImageUri": "048819808253.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "eu-north-1":
      "ImageUri": "895015795356.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "eu-south-1":
      "ImageUri": "933208885752.dkr.ecr.eu-south-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "eu-west-1":
      "ImageUri": "468650794304.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "eu-west-2":
      "ImageUri": "749857270468.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "eu-west-3":
      "ImageUri": "680080141114.dkr.ecr.eu-west-3.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "me-south-1":
      "ImageUri": "607024016150.dkr.ecr.me-south-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "sa-east-1":
      "ImageUri": "539772159869.dkr.ecr.sa-east-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "us-east-1":
      "ImageUri": "156813124566.dkr.ecr.us-east-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "us-east-2":
      "ImageUri": "777275614652.dkr.ecr.us-east-2.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "us-west-1":
      "ImageUri": "890145073186.dkr.ecr.us-west-1.amazonaws.com/sagemaker-model-monitor-analyzer:latest"
    "us-west-2":
      "ImageUri": "159807026194.dkr.ecr.us-west-2.amazonaws.com/sagemaker-model-monitor-analyzer:latest"

Resources:
  # Model built from the supplied container image and the training job's
  # model.tar.gz artifact in the sagemaker-<region>-<account> bucket.
  Model:
    Type: "AWS::SageMaker::Model"
    Properties:
      ModelName: !Sub "${ModelName}-prd-${TrainJobId}"
      PrimaryContainer:
        Image: !Ref ImageRepoUri
        ModelDataUrl: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/${ModelName}-${TrainJobId}/output/model.tar.gz"
      ExecutionRoleArn: !Ref DeployRoleArn

  # Single-variant endpoint configuration; captures 100% of request and
  # response payloads to S3, encrypted with the supplied KMS key.
  EndpointConfig:
    Type: "AWS::SageMaker::EndpointConfig"
    Properties:
      ProductionVariants:
        - InitialInstanceCount: 2
          InitialVariantWeight: 1.0
          InstanceType: ml.m5.large
          ModelName: !GetAtt Model.ModelName
          # The AutoScaling / AutoScalingPolicy ResourceId values below must
          # match this variant name.
          VariantName: !Sub "${ModelVariant}-${ModelName}"
      DataCaptureConfig:
        CaptureContentTypeHeader:
          CsvContentTypes:
            - "text/csv"
          JsonContentTypes:
            - "application/json"
        CaptureOptions:
          - CaptureMode: Input
          - CaptureMode: Output
        DestinationS3Uri: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/datacapture"
        EnableCapture: true
        InitialSamplingPercentage: 100
        KmsKeyId: !Ref KmsKeyId
      EndpointConfigName: !Sub "${ModelName}-pec-${TrainJobId}"
      KmsKeyId: !Ref KmsKeyId

  Endpoint:
    Type: "AWS::SageMaker::Endpoint"
    Properties:
      EndpointName: !Sub "${ModelName}-prd-${TrainJobId}"
      EndpointConfigName: !GetAtt EndpointConfig.EndpointConfigName

  # API Lambda exposed at POST /api. New versions are published to the "live"
  # alias and shifted in with a canary deployment guarded by the two error
  # alarms and the pre/post traffic hook functions.
  # NOTE(review): python3.7 (used by all three functions in this template) is
  # an old Lambda runtime - confirm it is still deployable in the target
  # account and plan an upgrade together with the handler code in ../api.
  ApiFunction:
    Type: AWS::Serverless::Function
    Properties:
      FunctionName: !Sub "${ModelName}-api"
      CodeUri: ../api
      Handler: app.lambda_handler
      Runtime: python3.7
      Role: !GetAtt ApiFunctionRole.Arn
      KmsKeyArn: !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/${KmsKeyId}"
      AutoPublishAlias: "live"
      #AutoPublishCodeSha256: !Ref TrainSha256
      DeploymentPreference:
        Type: Canary10Percent5Minutes  #AllAtOnce
        Alarms:
          # A list of alarms that you want to monitor
          - !Ref AliasErrorMetricGreaterThanZeroAlarm
          - !Ref LatestVersionErrorMetricGreaterThanZeroAlarm
        Hooks:
          # Validation Lambda functions that are run before and after traffic shifting
          PreTraffic: !Ref PreTrafficLambdaFunction
          PostTraffic: !Ref PostTrafficLambdaFunction
        TriggerConfigurations:
          - TriggerEvents:
              - DeploymentSuccess
              - DeploymentFailure
            TriggerName: DeploymentCompleteTrigger
            TriggerTargetArn: !Ref NotificationArn
      Environment:
        Variables:
          ENDPOINT_NAME: !GetAtt Endpoint.EndpointName
      Events:
        Invoke:
          Type: Api
          Properties:
            Path: /api
            Method: post
      Description: "Api deployment that invokes SageMaker endpoint"

  # Execution role for ApiFunction: basic Lambda logging, endpoint invocation
  # and publishing to the notification topic.
  ApiFunctionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Action:
              - sts:AssumeRole
            Effect: Allow
            Principal:
              Service:
                - lambda.amazonaws.com
        Version: "2012-10-17"
      ManagedPolicyArns:
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
      Policies:
        - PolicyDocument:
            Statement:
              - Sid: AllowSageMaker
                Effect: Allow
                Action:
                  - sagemaker:InvokeEndpoint
                # Deliberately left as an endpoint wildcard: the endpoint name
                # changes with every TrainJobId.
                Resource: !Sub "arn:${AWS::Partition}:sagemaker:*:*:endpoint/*"
              - Sid: AlowSNS
                Effect: Allow
                Action:
                  - sns:Publish
                Resource: !Ref NotificationArn
            Version: "2012-10-17"
          PolicyName: SageMakerInvokeEndpoint

  # CodeDeploy hook that runs before traffic is shifted to the new version.
  PreTrafficLambdaFunction:
    Type: AWS::Serverless::Function
    Properties:
      FunctionName: !Sub "CodeDeployHook_mlops-${ModelName}-PreTrafficLambdaFunction"
      CodeUri: ../api
      Handler: pre_traffic_hook.lambda_handler
      Runtime: python3.7
      Timeout: 30
      KmsKeyArn: !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/${KmsKeyId}"
      Policies:
        - Version: "2012-10-17"
          Statement:
            - Sid: AllowSageMaker
              Effect: Allow
              Action:
                - sagemaker:DescribeEndpoint
                - sagemaker:DescribeEndpointConfig
                - sagemaker:InvokeEndpoint
              Resource:
                - !Sub "arn:${AWS::Partition}:sagemaker:*:*:*/*"
            - Sid: AllowCodeDeploy
              Effect: Allow
              Action:
                - codedeploy:PutLifecycleEventHookExecutionStatus
              Resource: !Sub "arn:${AWS::Partition}:codedeploy:${AWS::Region}:${AWS::AccountId}:deploymentgroup:${ServerlessDeploymentApplication}/*"
      # The hook itself is not deployed gradually.
      DeploymentPreference:
        Enabled: false
      Environment:
        Variables:
          ENDPOINT_NAME: !GetAtt Endpoint.EndpointName
      Description: "Perform checks pre-shifting traffic to lambda"

  # CodeDeploy hook that runs after traffic shifting; it is granted read
  # access to the SageMaker bucket and receives the data capture location.
  PostTrafficLambdaFunction:
    Type: AWS::Serverless::Function
    Properties:
      FunctionName: !Sub "CodeDeployHook_mlops-${ModelName}-PostTrafficLambdaFunction"
      CodeUri: ../api
      Handler: post_traffic_hook.lambda_handler
      Runtime: python3.7
      Timeout: 30
      KmsKeyArn: !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/${KmsKeyId}"
      Policies:
        - Version: "2012-10-17"
          Statement:
            - Sid: AllowS3List
              Effect: Allow
              Action:
                - s3:ListBucket
              Resource: !Sub "arn:${AWS::Partition}:s3:::sagemaker-${AWS::Region}-${AWS::AccountId}"
            - Sid: AllowS3Get
              Effect: Allow
              Action:
                - s3:GetObject
              Resource: !Sub "arn:${AWS::Partition}:s3:::sagemaker-${AWS::Region}-${AWS::AccountId}/*"
            - Sid: AllowCodeDeploy
              Effect: Allow
              Action:
                - codedeploy:PutLifecycleEventHookExecutionStatus
              Resource: !Sub "arn:${AWS::Partition}:codedeploy:${AWS::Region}:${AWS::AccountId}:deploymentgroup:${ServerlessDeploymentApplication}/*"
      DeploymentPreference:
        Enabled: false
      Environment:
        Variables:
          ENDPOINT_NAME: !GetAtt Endpoint.EndpointName
          DATA_CAPTURE_URI: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/datacapture"
      Description: "Perform checks post-shifting traffic to lambda"

  # Model Monitor schedule: runs the region-specific analyzer image against
  # the endpoint's captured data, compared with the training baseline.
  SagemakerMonitoringSchedule:
    Type: AWS::SageMaker::MonitoringSchedule
    Properties:
      EndpointName: !GetAtt Endpoint.EndpointName
      MonitoringScheduleConfig:
        MonitoringJobDefinition:
          BaselineConfig:
            ConstraintsResource:
              S3Uri: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/monitoring/baseline/${ModelName}-pbl-${TrainJobId}/constraints.json"
            StatisticsResource:
              S3Uri: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/monitoring/baseline/${ModelName}-pbl-${TrainJobId}/statistics.json"
          MonitoringAppSpecification:
            ImageUri: !FindInMap [ModelAnalyzerMap, !Ref "AWS::Region", "ImageUri"]
          MonitoringInputs:
            - EndpointInput:
                EndpointName: !GetAtt Endpoint.EndpointName
                LocalPath: "/opt/ml/processing/endpointdata"
          MonitoringOutputConfig:
            MonitoringOutputs:
              - S3Output:
                  LocalPath: "/opt/ml/processing/localpath"
                  S3Uri: !Sub "s3://sagemaker-${AWS::Region}-${AWS::AccountId}/${ModelName}/monitoring/reports"
          MonitoringResources:
            ClusterConfig:
              InstanceCount: 1
              InstanceType: ml.m5.xlarge
              VolumeKmsKeyId: !Ref KmsKeyId
              VolumeSizeInGB: 30
          RoleArn: !Ref DeployRoleArn
          StoppingCondition:
            MaxRuntimeInSeconds: 1800
        ScheduleConfig:
          # Minute 0 of every hour.
          ScheduleExpression: "cron(0 * ? * * *)"
      # SagemakerScheduleAlarm's MonitoringSchedule dimension repeats this name.
      MonitoringScheduleName: !Sub "${ModelName}-pms"

  # Alarms when the chosen monitoring metric exceeds the supplied threshold.
  SagemakerScheduleAlarm:
    Type: "AWS::CloudWatch::Alarm"
    Properties:
      AlarmName: !Sub "${ModelName}-metric-gt-threshold"
      AlarmDescription: Schedule Metric > Threshold
      ComparisonOperator: GreaterThanThreshold
      Dimensions:
        - Name: Endpoint
          Value: !GetAtt Endpoint.EndpointName
        - Name: MonitoringSchedule
          Value: !Sub "${ModelName}-pms"
      EvaluationPeriods: 1
      DatapointsToAlarm: 1
      MetricName: !Ref ScheduleMetricName
      Namespace: aws/sagemaker/Endpoints/data-metrics
      Period: 60
      Statistic: Average
      Threshold: !Ref ScheduleMetricThreshold

  # Any Lambda error on the "live" alias over two consecutive minutes.
  AliasErrorMetricGreaterThanZeroAlarm:
    Type: "AWS::CloudWatch::Alarm"
    Properties:
      AlarmName: !Sub "${ModelName}-alias-gt-zero"
      AlarmDescription: Lambda Function Error > 0
      ComparisonOperator: GreaterThanThreshold
      Dimensions:
        - Name: Resource
          Value: !Sub "${ApiFunction}:live"
        - Name: FunctionName
          Value: !Ref ApiFunction
      EvaluationPeriods: 2
      MetricName: Errors
      Namespace: AWS/Lambda
      Period: 60
      Statistic: Sum
      Threshold: 0

  # Same as above, narrowed to the newly published function version.
  LatestVersionErrorMetricGreaterThanZeroAlarm:
    Type: "AWS::CloudWatch::Alarm"
    Properties:
      AlarmName: !Sub "${ModelName}-version-gt-zero"
      AlarmDescription: Lambda Function Error > 0
      ComparisonOperator: GreaterThanThreshold
      Dimensions:
        - Name: Resource
          Value: !Sub "${ApiFunction}:live"
        - Name: FunctionName
          Value: !Ref ApiFunction
        - Name: ExecutedVersion
          Value: !GetAtt ApiFunction.Version.Version
      EvaluationPeriods: 2
      MetricName: Errors
      Namespace: AWS/Lambda
      Period: 60
      Statistic: Sum
      Threshold: 0

  # Registers the endpoint variant as a scalable target (2 to 10 instances).
  # ResourceId is assembled from names rather than references, hence the
  # explicit DependsOn.
  AutoScaling:
    Type: AWS::ApplicationAutoScaling::ScalableTarget
    Properties:
      MaxCapacity: 10
      MinCapacity: 2
      ResourceId: !Sub "endpoint/${ModelName}-prd-${TrainJobId}/variant/${ModelVariant}-${ModelName}"
      RoleARN: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/MLOps"
      ScalableDimension: sagemaker:variant:DesiredInstanceCount
      ServiceNamespace: sagemaker
    DependsOn: Endpoint

  # Target tracking on invocations per instance for the scalable target above.
  AutoScalingPolicy:
    Type: "AWS::ApplicationAutoScaling::ScalingPolicy"
    Properties:
      PolicyName: SageMakerVariantInvocationsPerInstance
      PolicyType: TargetTrackingScaling
      ResourceId: !Sub "endpoint/${ModelName}-prd-${TrainJobId}/variant/${ModelVariant}-${ModelName}"
      ScalableDimension: sagemaker:variant:DesiredInstanceCount
      ServiceNamespace: sagemaker
      TargetTrackingScalingPolicyConfiguration:
        TargetValue: 750.0
        ScaleInCooldown: 60
        ScaleOutCooldown: 60
        PredefinedMetricSpecification:
          PredefinedMetricType: SageMakerVariantInvocationsPerInstance
    DependsOn: AutoScaling

Outputs:
  # ServerlessRestApi is an implicit API created out of Events key under Serverless::Function
  # Find out more about other implicit resources you can reference within SAM
  # https://github.com/awslabs/serverless-application-model/blob/master/docs/internals/generated_resources.rst#api
  DeploymentApplication:
    Description: "Regression deployment application"
    Value: !Ref ServerlessDeploymentApplication
  RestApi:
    Description: "API Gateway endpoint URL for Prod stage for Regression function"
    Value: !Sub "https://${ServerlessRestApi}.execute-api.${AWS::Region}.${AWS::URLSuffix}/Prod/api/"