# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Parsers, data models, and utilities for our custom (OCR-oriented) components

This module contains code for parsing and translating data objects from our custom OCR review SMGT
task template, ready for consolidation into the final output manifest.
"""
# Python Built-Ins:
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
import json
from logging import getLogger
import re
from typing import List, Optional

# Local Dependencies:
from smgt import BaseJsonable, BaseObjectParser, SMGTOutputBoundingBox

logger = getLogger("data_model")

# Keys of the form "ocr-{detection_id}-{suffix}" carry per-entity data in the raw task output.
# Compiled once at import time rather than on every key checked.
_DETECTION_KEY_PATTERN = re.compile(r"ocr-(.*)-[a-z]+", flags=re.IGNORECASE)


class OCRReviewStatus(str, Enum):
    """Ternary status for OCR transcription review"""

    correct = "correct"
    unclear = "unclear"
    wrong = "wrong"


@dataclass
class SMGTOCREntity(BaseJsonable, BaseObjectParser):
    """BBox+transcript review OCR entity annotation, as used in consolidation

    This class `parse()`s from raw template output format and serializes to final output manifest
    format - so it's a bit specific to consolidation/post-processing Lambda as written.

    Attributes
    ----------
    detection_id :
        Auto-generated identifier assigned to each bounding box cluster/group by the UI template.
    ocr_status :
        Parsed status of the OCR transcription review (correct, unclear, wrong).
    box_ixs :
        Indexes of the bounding boxes in the main crowd-bounding-box result that this entity
        corresponds to.
    class_id :
        Numeric ID of the entity type/class (either this or string label should be known).
    label :
        String name of the entity type/class (either this or the numeric class_id should be
        known).
    raw_text :
        The raw text for the entity as detected by OCR tool.
    target_text :
        The target/normalized text as overridden by the user.
    """

    detection_id: str
    ocr_status: OCRReviewStatus
    box_ixs: List[int]
    class_id: Optional[int] = None
    label: Optional[str] = None
    raw_text: Optional[str] = None
    target_text: Optional[str] = None

    @classmethod
    def find_detection_ids(cls, parent_obj: dict) -> List[str]:
        """Find all auto-generated entity/detection IDs in top-level custom task output data

        Because of the mechanics of the SM Crowd HTML Elements and the template, there are multiple
        keys in the annotation output storing each entity's raw data. This function discovers
        available entity/detection IDs in a result.

        Parameters
        ----------
        parent_obj :
            Top-level annotation data object as output by the UI task template, containing multiple
            fields.

        Returns
        -------
        List[str]
            Sorted, de-duplicated detection IDs extracted from keys matching
            ``ocr-{detection_id}-{suffix}``.
        """
        matches = (_DETECTION_KEY_PATTERN.match(key) for key in parent_obj)
        # Several keys share one detection ID (meta, statuses, override...), hence the set:
        return sorted({match.group(1) for match in matches if match})

    @classmethod
    def parse(
        cls,
        parent_obj: dict,
        detection_id: str,
        boxes: Optional[List[SMGTOutputBoundingBox]] = None,
    ) -> SMGTOCREntity:
        """Parse the entity with given ID from the *whole annotation object*

        Use the `find_detection_ids()` method to look up available IDs in the top-level annotation
        data, then this parser to extract each ID.

        Parameters
        ----------
        parent_obj :
            Top-level annotation data object as output by the UI task template, containing multiple
            fields.
        detection_id :
            Specific entity/group ID to extract for this entity
        boxes :
            If provided, these are used to validate the tagged `boxIxs` in the entity annotation
            are within range of the crowd-bounding-box tool's output, and to fill in `label` /
            `class_id` from the first linked box when the entity metadata does not provide them.

        Raises
        ------
        ValueError
            If missing data or inconsistencies prevent the entity from being parsed from raw data.
        """
        meta_field_key = f"ocr-{detection_id}-meta"
        if meta_field_key not in parent_obj:
            raise ValueError(
                "OCR annotation metadata key %s not found in raw data" % meta_field_key,
            )
        # The metadata field is stored as a JSON string within the annotation object:
        meta = json.loads(parent_obj[meta_field_key])

        box_ixs = meta["boxIxs"]
        if len(box_ixs) < 1:
            raise ValueError(
                "OCR annotation has no linked box annotations: %s" % detection_id,
            )
        label = meta.get("label")
        class_id = meta.get("labelId")
        raw_text = meta.get("ocrText")

        if boxes is not None:
            n_boxes = len(boxes)
            # Collect only the indexes that fall OUTSIDE the valid range. (Building a list of
            # per-index booleans here would always be non-empty, rejecting every entity.)
            illegal_box_ixs = [ix for ix in box_ixs if ix < 0 or ix >= n_boxes]
            if len(illegal_box_ixs) > 0:
                raise ValueError(
                    "OCR annotation '%s' links to boxIxs outside the range 0-%s: %s"
                    % (detection_id, n_boxes, illegal_box_ixs)
                )
            # Fall back to the first linked box for any class info missing from the metadata:
            if label is None:
                label = boxes[box_ixs[0]].label
            if class_id is None:
                class_id = boxes[box_ixs[0]].class_id

        # Each possible status has its own field "ocr-{detection_id}-{status}" in the raw data:
        OCR_STATUSES = tuple(s.value for s in OCRReviewStatus)  # String enum to Tuple[str]
        ocr_status_fields = [f"ocr-{detection_id}-{status}" for status in OCR_STATUSES]
        unknown_statuses = [
            status
            for status, field_key in zip(OCR_STATUSES, ocr_status_fields)
            if field_key not in parent_obj
        ]
        if unknown_statuses:
            logger.warning(
                "OCR annotation %s could not determine whether the following statuses were "
                "selected: %s",
                detection_id,
                unknown_statuses,
            )
        # A status counts as selected when its field holds a truthy "on" entry:
        selected_statuses = [
            status
            for status, field_key in zip(OCR_STATUSES, ocr_status_fields)
            if parent_obj.get(field_key, {}).get("on")
        ]
        n_selected_statuses = len(selected_statuses)
        if n_selected_statuses == 1:
            # Look up by *value*, since selected_statuses holds the enum values:
            parsed_status = OCRReviewStatus(selected_statuses[0])
        elif n_selected_statuses > 1:
            logger.warning(
                "OCR annotation %s selected %s statuses: %s. Marking as 'unclear'",
                detection_id,
                n_selected_statuses,
                selected_statuses,
            )
            parsed_status = OCRReviewStatus.unclear
        else:  # (0 selected statuses)
            logger.warning(  # TODO: push warnings through to output manifest?
                "Missing OCR review status for annotation %s. Assuming 'unclear'",
                detection_id,
            )
            parsed_status = OCRReviewStatus.unclear

        if parsed_status == OCRReviewStatus.correct:
            # OCR was confirmed correct, so the raw text *is* the target text:
            target_text = raw_text
        else:
            correction_field_key = f"ocr-{detection_id}-override"
            target_text = parent_obj.get(correction_field_key)
            if parsed_status == OCRReviewStatus.wrong and correction_field_key not in parent_obj:
                logger.warning(
                    "OCR annotation %s tagged as 'wrong', but target text field %s is missing",
                    detection_id,
                    correction_field_key,
                )

        return SMGTOCREntity(
            detection_id=detection_id,
            ocr_status=parsed_status,
            box_ixs=box_ixs,
            class_id=class_id,
            label=label,
            raw_text=raw_text,
            target_text=target_text,
        )

    def to_jsonable(self) -> dict:
        """Serialize to a camelCase-keyed dict, omitting any fields whose value is None"""
        return {
            k: v
            for k, v in {
                "detectionId": self.detection_id,
                "ocrStatus": self.ocr_status,
                "boxIxs": self.box_ixs,
                "classId": self.class_id,
                "label": self.label,
                "rawText": self.raw_text,
                "targetText": self.target_text,
            }.items()
            if v is not None
        }


@dataclass
class SMGTWorkerAnnotation(BaseJsonable, BaseObjectParser):
    """One worker's full annotation for a page using the custom bbox+transcript review task UI

    This class `parse()`s from raw template output format and serializes to final output manifest
    format - so it's a bit specific to consolidation/post-processing Lambda as written.

    Attributes
    ----------
    boxes :
        Parsed SMGT crowd-bounding-box boxes as labelled
    entities :
        Parsed OCR "entities" (bounding box groupings with transcription accuracy reviews)
    image_height :
        Input image height in pixels
    image_width :
        Input image width in pixels
    image_depth :
        Input image number of channels (usually 1 grayscale or 3 RGB) if known.
    """

    boxes: List[SMGTOutputBoundingBox]
    entities: List[SMGTOCREntity]
    image_height: int
    image_width: int
    image_depth: Optional[int] = None

    @classmethod
    def parse(
        cls,
        obj: dict,
        class_list: Optional[List[str]] = None,
        crowd_bounding_box_name: str = "boxtool",
    ) -> SMGTWorkerAnnotation:
        """Parse one worker's raw annotation object into boxes, entities and image metadata

        Parameters
        ----------
        obj :
            Top-level annotation data object as output by the UI task template.
        class_list :
            Optional list of class names, passed through to `SMGTOutputBoundingBox.parse()`.
        crowd_bounding_box_name :
            Key in `obj` under which the bounding box tool's output is stored.

        Raises
        ------
        KeyError
            If the bounding box tool output, or its `inputImageProperties` (height, width) or
            `boundingBoxes` fields, are missing from `obj`.
        """
        boxtool_data = obj[crowd_bounding_box_name]
        image_props = boxtool_data["inputImageProperties"]
        image_height = image_props["height"]
        image_width = image_props["width"]
        image_depth = image_props.get("depth")
        boxes = [
            SMGTOutputBoundingBox.parse(box, class_list=class_list)
            for box in boxtool_data["boundingBoxes"]
        ]

        entity_detection_ids = SMGTOCREntity.find_detection_ids(obj)
        entities = []
        for det_id in entity_detection_ids:
            # Best-effort: an entity that fails to parse is logged and skipped, so one bad
            # entity does not discard the rest of the worker's annotation.
            # NOTE(review): `boxes` is not passed through here, so entity boxIxs are not
            # range-checked against the parsed boxes - confirm whether that is intended.
            try:
                entities.append(SMGTOCREntity.parse(obj, det_id))
            except Exception:
                logger.exception("Failed to load annotated entity %s", det_id)
                # TODO: Propagate failed entity extractions as warnings to output too?

        return cls(
            boxes=boxes,
            entities=entities,
            image_height=image_height,
            image_width=image_width,
            image_depth=image_depth,
        )

    def to_jsonable(self) -> dict:
        """Serialize to output manifest format: image size, box annotations and OCR entities"""
        img_meta = {"height": self.image_height, "width": self.image_width}
        if self.image_depth is not None:
            img_meta["depth"] = self.image_depth
        return {
            # Image metadata and bounding boxes in format compatible with built-in BBox task:
            "image_size": [img_meta],
            "annotations": [box.to_jsonable() for box in self.boxes],
            # Additional data for OCR transcription reviews:
            "entities": [entity.to_jsonable() for entity in self.entities],
        }