from textract_util import * import io import os import json import time import boto3 def lambda_handler(event, context): #Initialize Boto Resource s3 = boto3.resource('s3') textract = boto3.client('textract') dynamodb = boto3.client('dynamodb') table_name=os.environ['table_name'] file_list = [] if "Records" in event: records = event['Records'] numRecords = len(records) print("{} messages recieved".format(numRecords)) for record in records: documentBlocks = None num_pages = 0 num_lines = 0 bucket = "" upload_prefix = "" textractJobId = "" textractStatus = "" textractAPI = "" textractJobTag = "" textractS3ObjectName = "" textractS3Bucket = "" textractTimestamp = "" if 'Sns' in record.keys(): sns = record['Sns'] if 'Message' in sns.keys(): message = json.loads(sns['Message']) textractJobId = message['JobId'] print("{} = {}".format("JobId", textractJobId)) textractStatus = message['Status'] print("{} = {}".format("Status",textractStatus)) textractTimestamp = str(int(float(message['Timestamp'])/1000)) print("{} = {}".format("Timestamp",textractTimestamp)) textractAPI = message['API'] print("{} = {}".format("API", textractAPI)) textractJobTag = message['JobTag'] print("{} = {}".format("JobTag", textractJobTag)) documentLocation = message['DocumentLocation'] textractS3ObjectName = documentLocation['S3ObjectName'] print("{} = {}".format("S3ObjectName", textractS3ObjectName)) textractS3Bucket = documentLocation['S3Bucket'] print("{} = {}".format("S3Bucket", textractS3Bucket)) bucket = textractS3Bucket document_path = textractS3ObjectName[:textractS3ObjectName.rfind("/")] if textractS3ObjectName.find("/") >= 0 else "" document_name = textractS3ObjectName[textractS3ObjectName.rfind("/")+1:textractS3ObjectName.rfind(".")] if textractS3ObjectName.find("/") >= 0 else textractS3ObjectName[:textractS3ObjectName.rfind(".")] document_type = textractS3ObjectName[textractS3ObjectName.rfind(".")+1:].upper() if document_path == "": upload_prefix = textractJobId else: upload_prefix = "{}/{}".format(document_path, textractJobId) print("upload_prefix = " + upload_prefix) num_pages, documentBlocks = GetTextDetectionResult(textract, textractJobId) if documentBlocks is not None and len(documentBlocks) > 0: print("{} Blocks retrieved".format(len(documentBlocks))) #Extract lines of texts into a Python dictionary by parsing the raw JSON from Textract blocks = groupBlocksByType(documentBlocks) document_text, num_lines = extractTextBody(blocks) #Generate JSON document using form fields information json_document = "{}-text.json".format(document_name) json_file = open("/tmp/"+json_document,'w+') json_file.write(json.dumps(document_text, indent=4, sort_keys=True)) json_file.close() s3.meta.client.upload_file("/tmp/"+json_document, bucket, "{}/{}".format(upload_prefix,json_document)) try: response = dynamodb.update_item( TableName=table_name, Key={ 'JobId':{'S':textractJobId}, 'JobType':{'S':'TextDetection'} }, ExpressionAttributeNames={"#tf": "TextFiles", "#jst": "JobStatus", "#jct": "JobCompleteTimeStamp", "#nl": "NumLines", "#np": "NumPages"}, UpdateExpression='SET #tf = list_append(#tf, :text_files), #jst = :job_status, #jct = :job_complete, #nl = :num_lines, #np = :num_pages', ExpressionAttributeValues={ ":text_files": {"L": [{"S": "{}/{}".format(upload_prefix,json_document)}]}, ":job_status": {"S": textractStatus}, ":job_complete": {"N": str(textractTimestamp)}, ":num_lines": {"N": str(num_lines)}, ":num_pages": {"N": str(num_pages)} } ) except Exception as e: print('DynamoDB Insertion Error is: {0}'.format(e)) else: try: response = dynamodb.update_item( TableName=table_name, Key={ 'JobId':{'S':textractJobId}, 'JobType':{'S':'TextDetection'} }, ExpressionAttributeNames={"#jst": "JobStatus", "#jct": "JobCompleteTimeStamp"}, UpdateExpression='SET #jst = :job_status, #jct = :job_complete', ExpressionAttributeValues={ ":job_status": {"S": textractStatus}, ":job_complete": {"N": str(textractTimestamp)} } ) except Exception as e: print('DynamoDB Insertion Error is: {0}'.format(e)) s3_result = s3.meta.client.list_objects_v2(Bucket=bucket, Prefix="{}/".format(upload_prefix), Delimiter = "/") if 'Contents' in s3_result: for key in s3_result['Contents']: if key['Key'].endswith("json"): file_list.append("https://s3.amazonaws.com/{}/{}".format(bucket, key['Key'])) while s3_result['IsTruncated']: continuation_key = s3_result['NextContinuationToken'] s3_result = s3.meta.client.list_objects_v2(Bucket=bucket, Prefix="{}/".format(upload_prefix), Delimiter="/", ContinuationToken=continuation_key) for key in s3_result['Contents']: if key['Key'].endswith("json"): file_list.append("https://s3.amazonaws.com/{}/{}".format(bucket, key['Key'])) print(file_list) return file_list