# MIT License
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject
# to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from urllib.parse import unquote_plus
import pandas as pd
import json
import boto3
import re
import sys
import uuid

prefix = 'intelligent-doc-demo'

# Create an SSM Client
ssm_client = boto3.client('ssm')

# Create an S3 Client
s3_client = boto3.client('s3')

# Create a Textract Client
textract_client = boto3.client('textract')

# Create a Comprehend Client
comprehend_client = boto3.client('comprehend')


def get_kv_map(bucket, key):
    # Analyze the uploaded document directly from S3 with FORMS feature
    response = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket, 'Name': key}},
        FeatureTypes=['FORMS'])
    print('got the image')

    # Get the text blocks
    blocks = response['Blocks']

    # Build key, value, and block lookup maps from the KEY_VALUE_SET blocks
    key_map = {}
    value_map = {}
    block_map = {}
    for block in blocks:
        block_id = block['Id']
        block_map[block_id] = block
        if block['BlockType'] == "KEY_VALUE_SET":
            if 'KEY' in block['EntityTypes']:
                key_map[block_id] = block
            else:
                value_map[block_id] = block

    return key_map, value_map, block_map


def get_kv_relationship(key_map, value_map, block_map):
    # Pair the text of every KEY block with the text of its linked VALUE block
    kvs = {}
    for block_id, key_block in key_map.items():
        value_block = find_value_block(key_block, value_map)
        key = get_text(key_block, block_map)
        val = get_text(value_block, block_map)
        kvs[key] = val
    return kvs


def find_value_block(key_block, value_map):
    # Follow the key block's VALUE relationship to the matching value block
    for relationship in key_block['Relationships']:
        if relationship['Type'] == 'VALUE':
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]
    return value_block


def get_text(result, blocks_map):
    # Concatenate the WORD children of a block; mark selected checkboxes with 'X'
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text


def prepareForInference(bucket, kvs):
    df = pd.DataFrame(kvs.items())
    print('got the dataframe')
    df.columns = ['key', 'value']

    # Drop empty values, then drop the rows that are not needed for inference
    # (the hard-coded index drops below are specific to the demo form layout)
    df1 = df[df.value != '']
    df1 = df1.reset_index(drop=True)
    df1 = df1.drop(df1.index[1:7])
    df1 = df1.reset_index(drop=True)
    df1 = df1.drop(df1.index[4])
    df1 = df1.reset_index(drop=True)
    df1 = df1.drop(df1.index[9:12])
    df1 = df1.reset_index(drop=True)
    df1 = df1.drop(df1.index[5:7])
    df1 = df1.reset_index(drop=True)

    # Transpose so the extracted keys become column headers
    df_T = df1.T
    print("in the middle of all the dataframe operations")
    df_T.columns = df_T.iloc[0]
    df_T = df_T.reset_index(drop=True)
    df_T = df_T.drop([0])
    df_T = df_T.reset_index(drop=True)

    # Normalize the demo form's verbose field labels and strip trailing spaces
    df_T = df_T.rename(columns={"Name (First, Middle, Last, Suffix) ": "Name",
                                "Date of Birth (mm/dd/yyyy) ": "Date of Birth"})
    df_T.columns = df_T.columns.str.rstrip()

    data_key = prefix + "/cer-input/" + str(uuid.uuid4()) + 'laminference.csv'

    # Build one plain-text line per document row (ONE_DOC_PER_LINE format)
    entry = ''
    for idx, row in df_T.iterrows():
        entry += ('Country:' + str(row['Country']).strip() + ' '
                  + 'Years:' + str(row['Years']).strip() + ' '
                  + 'Cell Phone:' + str(row['Cell Phone']).strip() + ' '
                  + 'Name:' + str(row['Name']).strip() + ' '
                  + 'Social Security Number:' + str(row['Social Security Number']).strip() + ' '
                  + 'TOTAL $:' + str(row['TOTAL $']).strip() + ' '
                  + 'Date of Birth:' + str(row['Date of Birth']).strip() + '\n')
    print('done done and done')

    # Store Processed Data in S3 Bucket as plain text so Comprehend can read it
    # line by line (json.dumps would wrap the payload in a single quoted JSON
    # string and break the ONE_DOC_PER_LINE format)
    processed_textract_data_response = s3_client.put_object(
        Bucket=bucket,
        Key=data_key,
        Body=entry
    )

    return data_key


def lambda_handler(event, context):
    # Get the Custom Entity Recognizer's ARN, execution role, and output bucket
    # from SSM Parameter Store
    comprehend_parameters = ssm_client.get_parameters(
        Names=['CustomEntityRecognizerARN-TCA2I',
               'ComprehendExecutionRole-TCA2I',
               'S3OutBucketName-TCA2I'],
        WithDecryption=True)

    for parameter in comprehend_parameters['Parameters']:
        if parameter['Name'] == 'CustomEntityRecognizerARN-TCA2I':
            custom_recognizer_arn = parameter['Value']
        elif parameter['Name'] == 'ComprehendExecutionRole-TCA2I':
            comprehend_execution_role_arn = parameter['Value']
        elif parameter['Name'] == 'S3OutBucketName-TCA2I':
            out_bucket = parameter['Value']

    # Iterate over all S3 Put records that have been passed to this lambda function.
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        key = unquote_plus(record['s3']['object']['key'])

        # Get key/value pairs from the uploaded input document
        key_map, value_map, block_map = get_kv_map(bucket, key)

        # Get the key/value relationships
        kvs = get_kv_relationship(key_map, value_map, block_map)
        print('got the key value pairs: ' + str(kvs))

        # Transform using a dataframe, create a CSV file, and load it to S3 for
        # Comprehend inference as the next step
        inkey = prepareForInference(bucket, kvs)
        print('uploaded the inference file')

        # Start the Custom Entity Recognition Job
        response = comprehend_client.start_entities_detection_job(
            InputDataConfig={
                'S3Uri': 's3://' + bucket + '/' + inkey,
                'InputFormat': 'ONE_DOC_PER_LINE'
            },
            OutputDataConfig={
                'S3Uri': 's3://' + out_bucket + '/' + prefix + '/cer-output/'
            },
            DataAccessRoleArn=comprehend_execution_role_arn,
            JobName=str(uuid.uuid4()) + '-TextractComprehendA2I',
            EntityRecognizerArn=custom_recognizer_arn,
            LanguageCode='en'
        )
        print("Custom Entity Detection Job Started")

    return {
        'statusCode': 200,
        'body': json.dumps('I did what I was asked to do!')
    }
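

# ---------------------------------------------------------------------------
# Local test sketch (not part of the deployed Lambda). A minimal example of
# how lambda_handler above could be exercised with a hand-built S3 Put event.
# The bucket name and object key below are placeholders, not values defined
# anywhere in this project; valid AWS credentials and the SSM parameters
# referenced in lambda_handler are assumed to exist in the target account.
if __name__ == '__main__':
    sample_event = {
        'Records': [
            {
                's3': {
                    'bucket': {'name': 'my-intelligent-doc-demo-bucket'},   # placeholder bucket
                    'object': {'key': 'intelligent-doc-demo/input/sample-form.png'}  # placeholder key
                }
            }
        ]
    }
    # Invoke the handler directly; context is unused, so None is sufficient here.
    print(lambda_handler(sample_event, None))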