#
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import time

import boto3

textract = boto3.client('textract')


class TextExtractor:
    """Gather the LINE text of a Textract text-detection job, grouped by page."""

    def extract_text(self, jobId):
        """
        Extract text from the document corresponding to jobId.

        Returns a dict keyed by page number. Each value is a dict of the form
        {"Number": <page number>, "Content": <text>}, where the content is the
        text of every LINE block on that page, joined with single spaces in
        the order the blocks were returned.
        """
        textract_result = self.__get_textract_result(jobId)
        pages = {}
        self.__extract_all_pages(jobId, textract_result, pages, [])
        return pages

    def __get_textract_result(self, jobId):
        """
        Retrieve the first batch of text-detection results for jobId.
        """
        return textract.get_document_text_detection(JobId=jobId)

    def __extract_all_pages(self, jobId, textract_result, pages, page_numbers):
        """
        Fill `pages` with the LINE text found in `textract_result` and in every
        follow-up batch reachable through its NextToken.

        `pages` and `page_numbers` are both mutated in place: `pages` receives
        one entry per page number, and `page_numbers` receives each page
        number the first time it is encountered.
        """
        # Set mirror of page_numbers so the per-block membership test is O(1);
        # the caller-visible list is still appended to in first-seen order.
        seen_pages = set(page_numbers)

        # Follow NextToken iteratively: one stack frame no matter how many
        # batches the job produced, so large documents cannot exhaust the
        # interpreter's recursion limit.
        while True:
            for block in textract_result['Blocks']:
                if block['BlockType'] != "LINE":
                    continue

                page_number = block['Page']
                if page_number in seen_pages:
                    pages[page_number]['Content'] += " " + block['Text']
                else:
                    seen_pages.add(page_number)
                    page_numbers.append(page_number)
                    pages[page_number] = {
                        "Number": page_number,
                        "Content": block['Text']
                    }

            # A missing, empty or None token all mean there is nothing more
            # to fetch.
            next_token = textract_result.get("NextToken")
            if not next_token:
                return

            textract_result = textract.get_document_text_detection(
                JobId=jobId,
                NextToken=next_token
            )