# MIT License # # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject # to the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import json
import re
from urllib.parse import unquote_plus

import boto3

# Names of the SSM parameters this function reads its configuration from.
_RECOGNIZER_ARN_PARAM = 'CustomEntityRecognizerARN-TCA2I'
_EXECUTION_ROLE_PARAM = 'ComprehendExecutionRole-TCA2I'
_OUTPUT_BUCKET_PARAM = 'ComprehendTemporaryDataStoreBucketName-TCA2I'

# Suffix appended to every Comprehend job name, and the service-side limit on
# the total job-name length (see the StartEntitiesDetectionJob API reference).
_JOB_NAME_SUFFIX = '-TextractComprehendA2I'
_MAX_JOB_NAME_LENGTH = 256


def _load_comprehend_settings(ssm_client):
    """Fetch the recognizer ARN, execution role ARN and output bucket from SSM.

    Returns:
        A ``(recognizer_arn, execution_role_arn, output_bucket)`` tuple.

    Raises:
        RuntimeError: if any of the three parameters is absent from the
            response. Failing here, before any document is processed, avoids
            an unbound-variable error midway through the record loop.
    """
    names = [_RECOGNIZER_ARN_PARAM, _EXECUTION_ROLE_PARAM, _OUTPUT_BUCKET_PARAM]
    result = ssm_client.get_parameters(Names=names, WithDecryption=True)
    values = {param['Name']: param['Value'] for param in result['Parameters']}

    missing = [name for name in names if name not in values]
    if missing:
        raise RuntimeError(
            'Missing SSM parameter(s): ' + ', '.join(missing))

    return (values[_RECOGNIZER_ARN_PARAM],
            values[_EXECUTION_ROLE_PARAM],
            values[_OUTPUT_BUCKET_PARAM])


def _document_stem(key):
    """Return *key* without its first folder segment and file extension.

    ``input/sub/report.v2.pdf`` becomes ``sub/report.v2``. A key with no
    folder segment or no extension keeps its name (instead of collapsing to
    an empty string), so distinct objects never map to the same output key.
    """
    _, slash, remainder = key.partition('/')
    path = remainder if slash else key

    # Only the final path segment can carry the file extension.
    parent, sep, leaf = path.rpartition('/')
    stem, dot, _ = leaf.rpartition('.')
    if dot and stem:
        leaf = stem
    return parent + sep + leaf


def _lines_to_text(blocks):
    """Join the text of every LINE block, each followed by a single space.

    Selecting by ``BlockType`` keeps the result independent of the order in
    which the blocks are listed and skips blocks of other types (PAGE, WORD).
    """
    return ''.join(
        block['Text'] + ' '
        for block in blocks
        if block['BlockType'] == 'LINE'
    )


def _build_job_name(filename):
    """Build a Comprehend job name from *filename*.

    Non-word characters are stripped and the result is truncated so that the
    name, including its fixed suffix, stays within the job-name length limit.
    """
    sanitized = re.sub(r'\W+', '', filename)
    room = _MAX_JOB_NAME_LENGTH - len(_JOB_NAME_SUFFIX)
    return sanitized[:room] + _JOB_NAME_SUFFIX


def lambda_handler(event, context):
    """Run Textract on each uploaded S3 object and start a Comprehend job.

    For every record in ``event['Records']`` the handler:

    1. sends the S3 object to Textract ``detect_document_text``;
    2. stores the returned blocks as JSON under ``textract-output/raw/``;
    3. stores the reconstructed line text under ``textract-output/processed/``;
    4. starts a Comprehend custom entity detection job on that text.

    Args:
        event: S3 notification event; each record must provide
            ``s3.bucket.name`` and ``s3.object.key``.
        context: Lambda context object (unused).

    Returns:
        ``0`` once every record has been submitted.

    Raises:
        RuntimeError: if a required SSM parameter is missing.
    """
    ssm_client = boto3.client('ssm')
    s3_client = boto3.client('s3')
    textract_client = boto3.client('textract')
    comprehend_client = boto3.client('comprehend')

    # Resolve configuration up front so a misconfiguration fails before any
    # Textract call or S3 write has happened.
    (customer_recognizer_arn,
     comprehend_execution_role_arn,
     comprehend_output_bucket) = _load_comprehend_settings(ssm_client)

    # Iterate over all S3 Put records that have been passed to this function.
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        # Object keys arrive URL-encoded in S3 event notifications.
        key = unquote_plus(record['s3']['object']['key'])

        # Send the S3 object to Textract.
        textract_response = textract_client.detect_document_text(
            Document={'S3Object': {'Bucket': bucket, 'Name': key}})
        blocks = textract_response['Blocks']

        # Name used for every artefact derived from this object.
        filename = _document_stem(key)

        # Save the raw Textract blocks next to the source document.
        s3_client.put_object(
            Bucket=bucket,
            Key='textract-output/raw/' + filename + '.json',
            Body=json.dumps(blocks)
        )
        print(f'Text Extraction Complete for {bucket}/{key}')

        # Recreate the raw text from the Textract output.
        raw_text = _lines_to_text(blocks)

        # NOTE(review): the text is JSON-encoded before upload, so the stored
        # document includes surrounding quotes and escape sequences. Kept
        # unchanged because downstream consumers may rely on the resulting
        # character offsets - confirm before switching to plain text.
        processed_data_key = 'textract-output/processed/' + filename + '.txt'
        s3_client.put_object(
            Bucket=bucket,
            Key=processed_data_key,
            Body=json.dumps(raw_text)
        )

        # Start the custom entity recognition job on the processed text.
        comprehend_client.start_entities_detection_job(
            InputDataConfig={
                'S3Uri': 's3://' + bucket + '/' + processed_data_key,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': 's3://' + comprehend_output_bucket + '/comprehend-output/raw/'
            },
            DataAccessRoleArn=comprehend_execution_role_arn,
            JobName=_build_job_name(filename),
            EntityRecognizerArn=customer_recognizer_arn,
            LanguageCode='en'
        )
        print("Custom Entity Detection Job Started")

    return 0