AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31

Metadata:
  # Application-level publishing metadata, keyed by
  # AWS::ServerlessRepo::Application.
  AWS::ServerlessRepo::Application:
    Name: ComprehendPiiRedactionS3ObjectLambda
    # Folded scalar: the lines below are joined with single spaces into one
    # line of text, with no trailing newline.
    Description: >-
      Deploys a Lambda which will provide capability to redact PII
      (Personally Identifiable Information) from a text file present in s3.
      This Lambda can be used as a s3 object lambda which will be triggered
      on get-object call when configured with access point
    Author: AWS Comprehend
    # SPDX license identifier (for example MIT, MIT-0, Apache-2.0);
    # see https://spdx.org/licenses for the full list.
    SpdxLicenseId: MIT-0
    # LicenseUrl and ReadmeUrl are given as bare file names, not full URLs.
    LicenseUrl: LICENSE
    ReadmeUrl: REDACTION_README.md
    Labels:
      - serverless
      - comprehend
      - pii
      - nlp
    HomePageUrl: 'https://aws.amazon.com/comprehend/'
    # Quoted so the version is unambiguously a string.
    SemanticVersion: '1.0.2'
    SourceCodeUrl: 'https://github.com/aws-samples/amazon-comprehend-s3-object-lambda-functions'

Parameters:
  # Deploy-time settings. Each parameter below is forwarded to
  # PiiRedactionFunction as an environment variable (see the
  # Environment.Variables mapping of that resource).
  #
  # Constraints that the descriptions already document (enumerated valid
  # values, numeric range) are enforced here so that an invalid value is
  # rejected at deployment time instead of reaching the function.

  # --- Logging and general behaviour ---
  LogLevel:
    Type: String
    Description: Log level for Lambda function logging, e.g., ERROR, INFO, DEBUG, etc.
    Default: INFO
  UnsupportedFileHandling:
    Type: String
    Description: Handling logic for Unsupported files. Valid values are PASS and FAIL.
    Default: FAIL
    # Enforces the valid values listed in the description.
    AllowedValues:
      - PASS
      - FAIL
  IsPartialObjectSupported:
    Type: String
    Description: Whether to support partial objects or not. Accessing partial object through http headers such byte-range can corrupt the object and/or affect PII detection accuracy.
    # NOTE(review): unquoted FALSE is read as a boolean by YAML 1.1 loaders,
    # not as the string 'FALSE'. Confirm which string the function actually
    # receives, and how it compares it, before quoting this default.
    Default: FALSE

  # --- Document size limits (bytes) ---
  DocumentMaxSizeContainsPiiEntities:
    Type: Number
    Description: Maximum document size (in bytes) to be used for making calls to Comprehend's ContainsPiiEntities API.
    Default: 50000
  DocumentMaxSizeDetectPiiEntities:
    Type: Number
    Description: Maximum document size (in bytes) to be used for making calls to Comprehend's DetectPiiEntities API.
    Default: 5000

  # --- Redaction behaviour ---
  PiiEntityTypes:
    Type: String
    Description: List of comma separated PII entity types to be considered for redaction. Refer Comprehend's documentation page for list of supported PII entity types.
    Default: ALL
  MaskCharacter:
    Type: String
    Description: A character that replaces each character in the redacted PII entity.
    # Must stay quoted: a bare * starts a YAML alias.
    Default: '*'
  MaskMode:
    Type: String
    Description: Specifies whether the PII entity is redacted with the mask character or the entity type. Valid values - REPLACE_WITH_PII_ENTITY_TYPE and MASK.
    Default: MASK
    # Enforces the valid values listed in the description.
    AllowedValues:
      - MASK
      - REPLACE_WITH_PII_ENTITY_TYPE

  # --- Chunking and classification tuning ---
  SubsegmentOverlappingTokens:
    Type: Number
    Description: Number of tokens/words to overlap among segments of a document in case chunking is needed because of maximum document size limit.
    Default: 20
  DocumentMaxSize:
    Type: Number
    Description: Default maximum document size (in bytes) that this function can process otherwise will throw exception for too large document size.
    Default: 102400
  ConfidenceThreshold:
    Type: Number
    Description: The minimum prediction confidence score above which PII classification and detection would be considered as final answer. Valid range (0.5 to 1.0).
    Default: 0.5
    # Enforces the valid range stated in the description (bounds inclusive).
    MinValue: 0.5
    MaxValue: 1.0
  MaxCharsOverlap:
    Type: Number
    Description: Maximum characters to overlap among segments of a document in case chunking is needed because of maximum document size limit.
    Default: 200
  DefaultLanguageCode:
    Type: String
    Description: Default language of the text to be processed. This code will be used for interacting with Comprehend.
    Default: en

  # --- Concurrency of Comprehend calls ---
  DetectPiiEntitiesThreadCount:
    Type: Number
    Description: Number of threads to use for calling Comprehend's DetectPiiEntities API. This controls the number of simultaneous calls that will be made from this Lambda.
    Default: 8
  ContainsPiiEntitiesThreadCount:
    Type: Number
    Description: Number of threads to use for calling Comprehend's ContainsPiiEntities API. This controls the number of simultaneous calls that will be made from this Lambda.
    Default: 20

  # --- Metrics ---
  PublishCloudWatchMetrics:
    Type: String
    Description: True if publish metrics to Cloudwatch, false otherwise. See README.md for details on CloudWatch metrics.
    # NOTE(review): unquoted True is read as a boolean by YAML 1.1 loaders,
    # not as the string 'True'. Confirm which string the function actually
    # receives, and how it compares it, before quoting this default.
    Default: True

Resources:
  # The single function defined by this template. Its entry point is
  # handler.redact_pii_documents_handler, packaged from the src/ directory.
  PiiRedactionFunction:
    Type: AWS::Serverless::Function
    Properties:
      Handler: handler.redact_pii_documents_handler
      CodeUri: src/
      # NOTE(review): confirm python3.8 is still accepted by Lambda for new
      # deployments; older runtimes are retired over time.
      Runtime: python3.8
      # Timeout is in seconds.
      Timeout: 60
      Tracing: Active
      Environment:
        # One variable per template parameter; values are passed through
        # unchanged via !Ref.
        Variables:
          # Logging, metrics and general behaviour
          LOG_LEVEL: !Ref LogLevel
          PUBLISH_CLOUD_WATCH_METRICS: !Ref PublishCloudWatchMetrics
          UNSUPPORTED_FILE_HANDLING: !Ref UnsupportedFileHandling
          PARTIAL_OBJECT_SUPPORT: !Ref IsPartialObjectSupported
          DEFAULT_LANGUAGE_CODE: !Ref DefaultLanguageCode
          # Document size limits
          DOCUMENT_MAX_SIZE: !Ref DocumentMaxSize
          DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES: !Ref DocumentMaxSizeContainsPiiEntities
          DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES: !Ref DocumentMaxSizeDetectPiiEntities
          # Chunk overlap and classification threshold
          SUBSEGMENT_OVERLAPPING_TOKENS: !Ref SubsegmentOverlappingTokens
          MAX_CHARS_OVERLAP: !Ref MaxCharsOverlap
          CONFIDENCE_THRESHOLD: !Ref ConfidenceThreshold
          # Redaction settings
          PII_ENTITY_TYPES: !Ref PiiEntityTypes
          MASK_CHARACTER: !Ref MaskCharacter
          MASK_MODE: !Ref MaskMode
          # Thread counts for Comprehend calls
          DETECT_PII_ENTITIES_THREAD_COUNT: !Ref DetectPiiEntitiesThreadCount
          CONTAINS_PII_ENTITIES_THREAD_COUNT: !Ref ContainsPiiEntitiesThreadCount
      Policies:
        # A single inline policy with three Allow statements, none of which
        # is scoped to a specific resource.
        - Statement:
            # Comprehend PII APIs.
            - Sid: ComprehendPiiDetectionPolicy
              Effect: Allow
              Action:
                - comprehend:DetectPiiEntities
                - comprehend:ContainsPiiEntities
              Resource: "*"
            # S3 Object Lambda WriteGetObjectResponse action.
            - Sid: S3AccessPointCallbackPolicy
              Effect: Allow
              Action:
                - s3-object-lambda:WriteGetObjectResponse
              Resource: "*"
            # Publishing metric data to CloudWatch.
            - Sid: CloudWatchMetricsPolicy
              Effect: Allow
              Action:
                - cloudwatch:PutMetricData
              Resource: "*"

Outputs:
  # Exposes the !Ref value of the PiiRedactionFunction resource.
  PiiRedactionFunctionName:
    Value: !Ref PiiRedactionFunction
    Description: 'Redaction Function Name'