# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Example integration for (Py)Tesseract as a custom OCR engine"""
# Python Built-Ins:
from logging import getLogger
import os
from statistics import mean
from tempfile import TemporaryDirectory
from typing import Iterable, List, Optional

# External Dependencies:
import pandas as pd
import pytesseract

# Local Dependencies:
from .base import BaseOCREngine, generate_response_json, OCRGeometry, OCRLine, OCRPage, OCRWord
from ..image_utils import Document


logger = getLogger("eng_tesseract")

# Point Tesseract at a default language data folder, unless the environment already specifies one:
if os.environ.get("TESSDATA_PREFIX") is None:
    os.environ["TESSDATA_PREFIX"] = "/opt/conda/share/tessdata"


class TesseractEngine(BaseOCREngine):
    """Tesseract-based engine for custom SageMaker OCR endpoint option"""

    engine_name = "tesseract"

    def process(self, raw_doc: Document, languages: Optional[Iterable[str]] = None) -> dict:
        """Run Tesseract OCR over every page of a document and build the response JSON

        Parameters
        ----------
        raw_doc :
            Document to analyze. Its workspace is set to a temporary directory for the duration
            of the call, and each page yielded by `get_pages()` is passed to Tesseract via its
            `file_path`.
        languages :
            Optional Tesseract language codes to use. They are joined with "+" to form the
            PyTesseract `lang` argument. When None, `self.default_languages` is used instead.

        Returns
        -------
        dict
            The result of `generate_response_json()` for all detected pages and this engine's
            name.
        """
        lang_spec = "+".join(self.default_languages if languages is None else languages)
        ocr_pages = []
        with TemporaryDirectory() as tmpdir:
            raw_doc.set_workspace(tmpdir)
            for ixpage, page in enumerate(raw_doc.get_pages()):
                logger.debug("Serializing page %s", ixpage + 1)
                page_ocr = pytesseract.image_to_data(
                    page.file_path,
                    output_type=pytesseract.Output.DATAFRAME,
                    lang=lang_spec,
                    pandas_config={
                        # Need this explicit override or else pages containing only a single number
                        # can sometimes have text column interpreted as numeric type:
                        "dtype": {"text": str},
                    },
                )
                ocr_pages += self.dataframe_to_ocrpages(page_ocr)

        return generate_response_json(ocr_pages, self.engine_name)

    @staticmethod
    def _build_line(words: List[OCRWord]) -> OCRLine:
        """Create an OCRLine from its words, using mean word confidence as the line confidence

        Tesseract LINE records (level 4) don't have a confidence (equals -1), so we use the
        average over the included WORDs as a heuristic. They *do* have T/L/H/W geometry info, but
        we ignore that for the sake of code simplicity and let OCRLine infer it from the union of
        all WORD bounding boxes.
        """
        return OCRLine(mean(w.confidence for w in words), words)

    @classmethod
    def dataframe_to_ocrpages(cls, ocr_df: pd.DataFrame) -> List[OCRPage]:
        """Convert a Tesseract DataFrame to a list of OCRPage ready for Textract-like serialization

        Tesseract TSVs / PyTesseract DataFrames group detections by multiple levels: Page, block,
        paragraph, line, word. Columns are: level, page_num, block_num, par_num, line_num,
        word_num, left, top, width, height, conf, text.

        Each level is introduced by a record, so for example there will be an initial record with
        (level=1, page_num=1, block_num=0, par_num=0, line_num=0, word_num=0)... And then several
        others before finally getting down to the first WORD record (level=5, page_num=1,
        block_num=1, par_num=1, line_num=1, word_num=1).

        Records are assumed to be sorted in order, as indeed they are direct from Tesseract.

        Parameters
        ----------
        ocr_df :
            DataFrame with the Tesseract columns listed above.

        Returns
        -------
        List[OCRPage]
            One OCRPage per level=1 (page) record, in ascending `page_num` order. Pages with no
            non-whitespace words are returned with no lines.

        Raises
        ------
        ValueError
            If any `page_num` has more than one level=1 record.
        """
        # First construct an indexable list of page geometries, as we'll need these to normalize
        # other entity coordinates from absolute pixel values to 0-1 range:
        # (Note: In fact this function will often be called with only one page_num at a time)
        page_recs = ocr_df[ocr_df["level"] == 1]
        page_dims = page_recs.groupby("page_num").agg(
            {
                "left": "min",
                "top": "min",
                "width": "max",
                "height": "max",
                "page_num": "count",
            }
        )
        # There should be exactly one level=1 record per page in the dataframe. After checking
        # this, we can dispose the "page_num" count column.
        is_duplicated = page_dims["page_num"] > 1
        if is_duplicated.sum() > 0:
            # Report only the page numbers that are actually duplicated (capped to the first 20):
            raise ValueError(
                "Tesseract DataFrame had duplicate entries for these page_nums at level 1: %s"
                % page_dims.index[is_duplicated].values[:20]
            )
        page_dims = page_dims.drop(columns="page_num")

        # We need to collapse the {block, paragraph} levels of Tesseract hierarchy to preserve only
        # PAGE, LINE and WORD for consistency with Textract. Here we'll assume the DataFrame is in
        # its original Tesseract sort order, allowing iteration through the records to correctly
        # roll the entities up. Although iterating through large DataFrames isn't generally a
        # performant practice, this could always be balanced with specific parallelism if wanted:
        # E.g. processing multiple pages at once.
        pages = {
            num: OCRPage([])  # Initialise all pages first with no text
            for num in sorted(page_recs["page_num"].unique())
        }
        cur_page_num = None
        page_dim_rec = None
        page_lines = []
        cur_line_id = None
        line_words = []

        # Loop through all WORD records, ignoring whitespace-only ones that Tesseract likes to yield
        words_df = ocr_df[ocr_df["level"] == 5].copy()
        words_df["text"] = words_df["text"].str.strip()
        words_df = words_df[words_df["text"].str.len() > 0]
        for _, rec in words_df.iterrows():
            page_num = rec.page_num
            # block/par/line numbering restarts on every page, so page_num must be part of the
            # line identity. Otherwise the last line of one page and an identically-numbered first
            # line of the next page would be merged, and attributed to the wrong page.
            line_id = (page_num, rec.block_num, rec.par_num, rec.line_num)
            if cur_line_id != line_id:
                # Start of new LINE - add previous one to result:
                if cur_line_id is not None:
                    page_lines.append(cls._build_line(line_words))
                cur_line_id = line_id
                line_words = []
            if cur_page_num != page_num:
                # Start of new PAGE - add previous one to result:
                if cur_page_num is not None:
                    pages[cur_page_num].add_lines(page_lines)
                cur_page_num = page_num
                page_lines = []
                # Page dimensions only change when the page does, so look them up once per page:
                page_dim_rec = page_dims.loc[page_num]

            # Parse this record into a WORD:
            line_words.append(
                OCRWord(
                    rec.text,
                    rec.conf,
                    OCRGeometry.from_bbox(
                        # Word geometries, too, need normalizing by page dimensions.
                        (rec.top - page_dim_rec.top) / page_dim_rec.height,
                        (rec.left - page_dim_rec.left) / page_dim_rec.width,
                        rec.height / page_dim_rec.height,
                        rec.width / page_dim_rec.width,
                    ),
                )
            )

        # End of last line and last page: Add any remaining content.
        if line_words:
            page_lines.append(cls._build_line(line_words))
        if page_lines:
            pages[cur_page_num].add_lines(page_lines)

        return list(pages.values())