# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""CDK for OCR stage of the document processing pipeline
"""
# Python Built-Ins:
from typing import List, Optional, Union

# External Dependencies:
from aws_cdk import Token
import aws_cdk.aws_iam as iam
from aws_cdk.aws_s3 import Bucket
from constructs import Construct

# Local Dependencies:
from .sagemaker_ocr import SageMakerOCRStep
from .textract_ocr import TextractOCRStep
from ..shared.sagemaker import SageMakerCallerFunction


class OCRStep(Construct):
    """CDK construct for a document pipeline step to OCR incoming documents/images

    This construct's `.sfn_task` expects inputs with $.Input.Bucket and $.Input.Key properties
    specifying the location of the raw input document, and will return an object with Bucket and
    Key pointing to a consolidated JSON OCR output in Amazon Textract-compatible format.

    In addition to the standard (Amazon Textract-based) option, this construct supports building
    and deploying alternative, custom OCR options. Multiple engines may be built and/or deployed
    (to support experimentation), but the pipeline must be pointed to exactly one custom SageMaker
    or Amazon Textract OCR provider.

    Attributes
    ----------
    textract_step :
        The Amazon Textract-based OCR step. Always created, even when a SageMaker engine is in use.
    sagemaker_step :
        The SageMaker-based OCR step, or None when `build_sagemaker_ocrs` is empty.
    sfn_task :
        The Step Functions task the pipeline should run for OCR: taken from `sagemaker_step` when
        `use_sagemaker_ocr` is set and a SageMaker step was built, otherwise from `textract_step`.
    """

    def __init__(
        self,
        scope: Construct,
        id: str,
        lambda_role: iam.Role,
        ssm_param_prefix: Union[Token, str],
        input_bucket: Bucket,
        output_bucket: Bucket,
        output_prefix: str,
        input_prefix: Optional[str] = None,
        build_sagemaker_ocrs: Optional[List[str]] = None,
        deploy_sagemaker_ocrs: Optional[List[str]] = None,
        use_sagemaker_ocr: Optional[str] = None,
        enable_sagemaker_autoscaling: bool = False,
        shared_sagemaker_caller_lambda: Optional[SageMakerCallerFunction] = None,
    ):
        """Create an OCRStep

        Parameters
        ----------
        scope :
            CDK construct scope
        id :
            CDK construct ID
        lambda_role :
            IAM Role that the OCR-invoking Lambda function(s) will run with. Passed to the Amazon
            Textract step and, if built, to the SageMaker OCR step.
        ssm_param_prefix :
            Prefix to be applied to generated SSM pipeline configuration parameter names. Only
            forwarded to the SageMaker OCR step, so unused when no SageMaker engines are built.
        input_bucket :
            Bucket from which input documents will be fetched. Only forwarded to the SageMaker OCR
            step (presumably to grant model access limited to `input_prefix` - confirm in
            pipeline/ocr/sagemaker_ocr.py).
        output_bucket :
            (Pre-existing) S3 bucket where OCR result files should be stored
        output_prefix :
            Prefix under which OCR result files should be stored in S3 (under this prefix, the
            original input document keys will be mapped).
        input_prefix :
            Prefix under `input_bucket` from which input documents will be fetched. Only forwarded
            to the SageMaker OCR step.
        build_sagemaker_ocrs :
            List of alternative (SageMaker-based) OCR engine names to build container images and
            SageMaker Models for in the deployed stack. By default (None, equivalent to []), none
            will be included and no SageMaker OCR step is created. See `CUSTOM_OCR_ENGINES` in
            pipeline/ocr/sagemaker_ocr.py for supported engines.
        deploy_sagemaker_ocrs :
            List of alternative OCR engine names to deploy SageMaker endpoints for in the stack.
            Any names in here must also be included in `build_sagemaker_ocrs`. Default (None,
            equivalent to []): Support Amazon Textract OCR only.
        use_sagemaker_ocr :
            Optional alternative OCR engine name to use in the deployed document pipeline. If set
            and not empty, this must also be present in `build_sagemaker_ocrs` and
            `deploy_sagemaker_ocrs`. Default None: Use Amazon Textract for initial document OCR.
            NOTE: If this is set but `build_sagemaker_ocrs` is empty, no error is raised by this
            construct and the pipeline falls back to Amazon Textract.
        enable_sagemaker_autoscaling :
            Set True to enable auto-scaling on SageMaker OCR endpoints (if any are deployed), to
            optimize resource usage (recommended for production use). Set False to disable it and
            avoid cold-starts (good for development).
        shared_sagemaker_caller_lambda :
            Optional pre-existing SageMaker caller Lambda function, to share this between multiple
            SageMakerSSMSteps in the app if required.
        """
        super().__init__(scope, id)

        # Substitute a *fresh* list for each omitted argument: a literal `[]` default in the
        # signature would be one shared object across every OCRStep, so any downstream mutation
        # would leak between instances. Caller-provided lists are passed through untouched.
        build_engine_names = [] if build_sagemaker_ocrs is None else build_sagemaker_ocrs
        deploy_engine_names = [] if deploy_sagemaker_ocrs is None else deploy_sagemaker_ocrs

        # The SageMaker-based step is only created when at least one custom engine is to be built.
        self.sagemaker_step: Optional[SageMakerOCRStep] = None
        if build_engine_names:
            self.sagemaker_step = SageMakerOCRStep(
                self,
                "SageMakerStep",
                lambda_role=lambda_role,
                ssm_param_prefix=ssm_param_prefix,
                input_bucket=input_bucket,
                ocr_results_bucket=output_bucket,
                input_prefix=input_prefix,
                ocr_results_prefix=output_prefix,
                build_engine_names=build_engine_names,
                deploy_engine_names=deploy_engine_names,
                use_engine_name=use_sagemaker_ocr,
                enable_autoscaling=enable_sagemaker_autoscaling,
                shared_sagemaker_caller_lambda=shared_sagemaker_caller_lambda,
            )

        # The Textract step is always created, regardless of which engine the pipeline uses.
        self.textract_step = TextractOCRStep(
            self,
            "TextractStep",
            lambda_role=lambda_role,
            output_bucket=output_bucket,
            output_prefix=output_prefix,
        )

        # Point the pipeline at exactly one provider. Textract is the fallback whenever no custom
        # engine was requested *or* no SageMaker step was built.
        if use_sagemaker_ocr and self.sagemaker_step:
            self.sfn_task = self.sagemaker_step.sfn_task
        else:
            self.sfn_task = self.textract_step.sfn_task

    @property
    def textract_state_machine(self):
        """The `textract_state_machine` exposed by the underlying Amazon Textract OCR step"""
        return self.textract_step.textract_state_machine