# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import json
import boto3
from datetime import datetime
import lambda_helper
import os

textract_client = boto3.client("textract")


def lambda_handler(event, context):
    """Handle a job notification delivered through SNS and store the parsed paragraphs.

    The first SNS record's ``Message`` is decoded as JSON. Its job id, document
    location, status and timestamp are recorded through ``lambda_helper``, the
    job's text results are fetched, headers are detected, and the text found
    under each header is written out via ``lambda_helper``.

    Args:
        event: Lambda event; ``event['Records'][0]['Sns']['Message']`` must be a
            JSON string containing ``JobId``, ``DocumentLocation`` (with
            ``S3Bucket`` and ``S3ObjectName``), ``Status`` and ``Timestamp``.
        context: Lambda context object (unused).

    Returns:
        A JSON string encoding the header -> paragraph-text mapping.
    """
    sns_payload = json.loads(event['Records'][0]['Sns']['Message'])

    job_id = sns_payload['JobId']
    s3_location = sns_payload['DocumentLocation']
    document_path = "s3://{}/{}".format(
        s3_location['S3Bucket'], s3_location['S3ObjectName'])
    print("document_path is {}".format(document_path))

    job_status = sns_payload['Status']
    # NOTE(review): dividing by 1000 presumes the timestamp is epoch
    # milliseconds - confirm against the notification format.
    # fromtimestamp() renders the value in the runtime's local timezone.
    completed_epoch_seconds = sns_payload['Timestamp'] / 1000
    completed_time = datetime.fromtimestamp(
        completed_epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')

    lambda_helper.update_metadata_with_status(
        job_id, document_path, job_status, completed_time)

    textract_responses = lambda_helper.get_text_results_from_textract(job_id)
    # The second element of the helper's result is not used by this handler.
    total_text_with_info, _font_sizes_and_line_numbers = (
        lambda_helper.get_the_text_with_required_info(textract_responses))

    headers, header_and_its_line_numbers = get_headers_info(
        total_text_with_info)
    print("headers identified are {}".format(','.join(headers)))

    headers_to_paragraphs = get_header_to_paragraph_data(
        header_and_its_line_numbers, total_text_with_info)
    lambda_helper.update_paragraphs_info_in_dynamodb(
        headers_to_paragraphs, document_path)

    return json.dumps(headers_to_paragraphs)
def get_headers_info(total_text_with_info, header_height_ratio=0.97):
    """Identify header lines by comparing each line's font height to the average.

    A line is treated as a header when its ``font_height`` is greater than
    ``header_height_ratio`` times the mean ``font_height`` of all lines and its
    text is longer than one character. Header lines that directly follow one
    another are merged into a single header (texts joined with a space); the
    merged header is recorded at the line number of its last line.

    Args:
        total_text_with_info: Sequence of dicts, each providing ``'text'``,
            ``'font_height'`` and ``'line_number'``.
        header_height_ratio: Multiplier applied to the mean font height to get
            the header threshold. Defaults to 0.97, the previously hard-coded
            value.

    Returns:
        A tuple ``(headers, header_and_its_line_numbers)``: the list of header
        texts in document order, and a dict mapping each header text to its
        line number.
    """
    headers = []
    header_and_its_line_numbers = {}

    heights = [block['font_height'] for block in total_text_with_info]
    if not heights:
        # Nothing to analyse; also avoids dividing by zero below.
        return headers, header_and_its_line_numbers

    # Computed once up front instead of being re-derived for every line.
    height_average = sum(heights, 0.0) / len(heights) * header_height_ratio

    is_lastblock_header = False
    last_header = ""
    for block in total_text_with_info:
        if block['font_height'] > height_average:
            # Tall blocks of at most one character are ignored entirely; they
            # neither start a header nor break a run of consecutive headers.
            if len(block['text']) > 1:
                new_header_text = block['text']
                if is_lastblock_header:
                    # The previous header is always the most recently appended
                    # entry, so pop it rather than searching the list.
                    headers.pop()
                    # Drop the partial header's line-number entry as well, so
                    # the dict stays consistent with the list. It is kept only
                    # if an earlier header with identical text still exists.
                    if last_header not in headers:
                        header_and_its_line_numbers.pop(last_header, None)
                    print('join consecutive header {} + {}'.format(last_header, new_header_text))
                    new_header_text = last_header + " " + new_header_text
                headers.append(new_header_text)
                header_and_its_line_numbers[new_header_text] = block['line_number']
                is_lastblock_header = True
                last_header = new_header_text
                print('header:{}:{}:{}'.format(block['font_height'],height_average,last_header))
        else:
            print('not header:{}:{}:{}'.format(block['font_height'],height_average, block['text']))
            is_lastblock_header = False
    return headers, header_and_its_line_numbers
def get_header_to_paragraph_data(header_and_its_line_numbers, total_text_with_info):
    """Collect, for every header, the text that lies between it and the next header.

    Headers are visited in the iteration order of ``header_and_its_line_numbers``.
    For each one, the ``'text'`` of every line whose ``'line_number'`` is
    strictly greater than the header's line number and strictly less than the
    following header's line number is gathered; the final header takes all
    lines after it. The gathered texts are joined with single spaces.

    Args:
        header_and_its_line_numbers: Dict mapping header text to line number.
        total_text_with_info: Sequence of dicts providing ``'text'`` and
            ``'line_number'``.

    Returns:
        Dict mapping header text to its paragraph text. Headers with no text
        under them are omitted (a message is printed for each).
    """
    ordered_headers = list(header_and_its_line_numbers)
    headers_to_paragraphs = {}

    for position, current_header in enumerate(ordered_headers):
        if not current_header:
            # A falsy header key ends processing.
            break

        start_line = header_and_its_line_numbers[current_header]
        if position + 1 < len(ordered_headers):
            following_header = ordered_headers[position + 1]
        else:
            following_header = None

        if following_header:
            end_line = header_and_its_line_numbers[following_header]
            paragraph_lines = [
                entry['text']
                for entry in total_text_with_info
                if start_line < entry['line_number'] < end_line
            ]
        else:
            # Last header: everything after it belongs to its paragraph.
            paragraph_lines = [
                entry['text']
                for entry in total_text_with_info
                if entry['line_number'] > start_line
            ]

        if paragraph_lines:
            headers_to_paragraphs[current_header] = " ".join(paragraph_lines)
        else:
            print('skipping header {}'.format(current_header))

    return headers_to_paragraphs