# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 """Common definitions for integrating custom OCR engines to the stack. This module defines base classes which custom OCR integrations should conform to, and tools for standardising OCR output in an Amazon Textract-like format. """ # Python Built-Ins: from __future__ import annotations from abc import ABC, abstractmethod from typing import Dict, Iterable, List, Optional from uuid import uuid4 # Local Dependencies: from ..preproc import Document class BaseOCREngine(ABC): """Base class for a custom OCR engine""" def __init__(self, default_languages: Iterable[str]): """Create a BaseOCREngine Parameters ---------- default_languages : List/iterable of language codes that should be detected by default in documents, in case `process()` method doesn't specify at run time. """ self.default_languages = default_languages @abstractmethod def process(self, raw_doc: Document, languages: Optional[Iterable[str]] = None) -> dict: """OCR raw_doc and return Amazon Textract-like JSON This is the main method a custom OCR integration must implement, to process a document and return a compatible result. See `generate_response_json()` for output conversion tooling. """ raise NotImplementedError("OCREngine must implement process() method") def generate_guid() -> str: """Generate a GUID/UUID in a similar format to Amazon Textract response JSON""" return str(uuid4()) class OCRGeometry: """Amazon Textract-like object geometry data structure You may want to use the `from_*()` methods to create a full geometry from raw boxes/points. """ def __init__( self, top: float, left: float, height: float, width: float, polygon: List[Dict[str, float]] ): """Create an OCRGeometry For this direct constructor method, you must already have a self-consistent set of both bounding box and polygon information. For other cases, see factory methods instead. Amazon Textract-like geometries use coordinates normalized to page size, so should typically be in the range 0-1. """ if height >= 2.0 or width >= 2.0: raise ValueError( "For consistency with Amazon Textract, OCR object coordinates should be relative " "to page canvas and therefore approximately in range 0-1. Got height=%s, width=%s" % (height, width) ) self.top = top self.left = left self.height = height self.width = width self.polygon = polygon @classmethod def from_bbox(cls, top: float, left: float, height: float, width: float) -> OCRGeometry: """Create an OCRGeometry from a (page-normalized) bounding box The geometry's polygon will be initialized to exactly match the bounding box. To produce an Amazon Textract-like result, your T/L/H/W coordinates should be relative to the page itself: I.e. all usually in the range 0-1. """ right = left + width bottom = top + height polygon = [ {"X": left, "Y": top}, {"X": right, "Y": top}, {"X": right, "Y": bottom}, {"X": left, "Y": bottom}, ] return cls(top=top, left=left, height=height, width=width, polygon=polygon) @classmethod def from_polygon_list(cls, points: List[List[float]]) -> OCRGeometry: """Create an OCRGeometry from a list of [x,y] coordinate tuples defining a polygon The geometry's bounding box will be automatically inferred from the polygon coordinates. To produce an Amazon Textract-like result, your T/L/H/W coordinates should be relative to the page itself: I.e. all usually in the range 0-1. """ x_coords = [p[0] for p in points] y_coords = [p[1] for p in points] top = min(x_coords) left = min(y_coords) return cls( top=top, left=left, height=max(y_coords) - top, width=max(x_coords) - left, polygon=[{"X": p[0], "Y": p[1]} for p in points], ) @classmethod def union_bboxes(cls, *geometries: OCRGeometry) -> OCRGeometry: """Create an OCRGeometry for the box bounding several 'child' geometries This will produce a rectangular box, regardless of the polygon shape of input geometry objects. """ top = min(g.top for g in geometries) left = min(g.left for g in geometries) bottom = max(g.top + g.height for g in geometries) right = max(g.left + g.width for g in geometries) return cls.from_bbox( top=top, left=left, height=(bottom - top), width=(right - left), ) def to_json(self) -> dict: """Render this geometry as an Amazon Textract-like JSON-able dictionary""" return { "BoundingBox": { "Width": self.width, "Height": self.height, "Left": self.left, "Top": self.top, }, "Polygon": [{k: v for k, v in point.items()} for point in self.polygon], } class OCRWord: """Amazon Textract-like representation of a detected word on page""" def __init__( self, text: str, confidence: float, geometry: OCRGeometry, text_type: Optional[str] = None ): """Create an OCRWord TODO: Should this and other classes validate/enforce confidence>1.0? Parameters ---------- text : Text of the detected word confidence : 0-100 scaled confidence score for OCR of the detected word geometry : Position & shape of the detected word on the page text_type : Optional 'HANDWRITING' or 'PRINTED' specifier, if available """ self.id = generate_guid() self.text = text self.confidence = confidence self.geometry = geometry self.text_type = text_type def to_json(self) -> dict: """Render this word as an Amazon Textract-like JSON-able dictionary""" return { "Id": self.id, "BlockType": "WORD", "Confidence": self.confidence, "Geometry": self.geometry.to_json(), # "Page": This will be added in post-processing by generate_response_json() "Text": self.text, **({} if self.text_type is None else {"TextType": self.text_type}), } class OCRLine: """Amazon Textract-like representation of a line of text""" def __init__( self, confidence: float, words: Iterable[OCRWord], geometry: Optional[OCRGeometry] = None, ): """Create an OCRLine Parameters ---------- confidence : 0-100 scaled confidence with which this text line was detected words : List/iterable of word objects within this line geometry : (Optional) If a geometry for the line is not explicitly provided, the bounding box enclosing all the `words` will be used. """ self.id = generate_guid() self.confidence = confidence self.words = words self._geometry = geometry @property def geometry(self) -> OCRGeometry: return self._geometry or OCRGeometry.union_bboxes(*(w.geometry for w in self.words)) def to_blocks(self) -> List[dict]: """Render this line as list of Amazon Textract-like JSON-able blocks""" word_blocks = [w.to_json() for w in self.words] line_block = { "Id": self.id, "BlockType": "LINE", "Confidence": self.confidence, "Text": " ".join(w.text.strip() for w in self.words), "Geometry": self.geometry.to_json(), # "Page" will be added in post-processing by generate_response_json() "Relationships": [ { "Type": "CHILD", "Ids": [word.id for word in self.words], }, ], } return [line_block] + word_blocks class OCRPage: """Amazon Textract-like representation of a processed page/image""" def __init__(self, lines: Iterable[OCRLine], geometry: Optional[OCRGeometry] = None): """Create an OCRPage Parameters ---------- lines : List/iterable of text lines within the page geometry : Optional override geometry of the page. Defaults to a 0,0,1,1 box consistent with Amazon Textract. """ self.id = generate_guid() self.geometry = geometry or OCRGeometry.from_bbox(0, 0, 1, 1) self.lines = lines def add_lines(self, lines: List[OCRLine]) -> None: """Add text lines to an already-created OCRPage""" self.lines += lines def to_blocks(self) -> List[dict]: """Render this page as list of Amazon Textract-like JSON-able blocks""" child_blocks = [b for line in self.lines for b in line.to_blocks()] page_block = { "Id": self.id, "BlockType": "PAGE", "Geometry": self.geometry.to_json(), # "Page" will be added in post-processing by generate_response_json() "Relationships": [ { "Type": "CHILD", "Ids": [line.id for line in self.lines], }, ], } return [page_block] + child_blocks def generate_response_json(pages: List[OCRPage], engine_name: str) -> dict: """Create an Amazon-Textract-like, JSON-able response dict for an OCR result Parameters ---------- pages : List of OCRPage objects describing detected lines and words of text with their positions engine_name : Custom OCR engine identifier, which will be reported as an alternative model version in the result. """ page_blocks_by_page = [page.to_blocks() for page in pages] for page_ix, blocks in enumerate(page_blocks_by_page): for block in blocks: block["Page"] = page_ix + 1 return { "DetectDocumentTextModelVersion": f"custom-{engine_name}", "DocumentMetadata": {"Pages": len(pages)}, "JobStatus": "SUCCEEDED", "Blocks": [block for page in page_blocks_by_page for block in page], }