#!/usr/bin/env python3
"""Command line helper around Amazon Textract.

Sends one or more documents to Textract (or reads an existing Textract JSON
response from stdin), prints the response as JSON or pretty-printed text, and
optionally draws the detected bounding boxes / polygons onto the document
image, either popping the result up or saving it to a folder.
"""
import argparse
import glob
import io
import json
import logging
import os
import sys
from typing import List, Optional, Tuple

import boto3
from PIL import Image, ImageDraw, ImageFont, ImageSequence

import textracthelper._version
import trp.trp2 as t2
from textractcaller.t_call import Textract_Features, Textract_Types, call_textract, is_tiff
from textracthelper._version import __version__
from textractoverlayer.image_tools import get_filename_from_document
from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes
from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_string

logger = logging.getLogger(__name__)

S3_SCHEME = "s3://"

# Outline colour (RGB) per overlay box type.
OVERLAY_COLORS = {
    "WORD": (128, 128, 0),
    "LINE": (0, 128, 128),
    "FORM": (128, 0, 128),
    "KEY": (255, 0, 0),
    "VALUE": (0, 255, 255),
    "CELL": (255, 0, 255),
    "TABLE": (255, 255, 0),
}

# File suffixes (upper-cased, without the dot) that differ from the format
# name Pillow expects in Image.save().
SAVE_FORMAT_ALIASES = {"JPG": "JPEG", "TIF": "TIFF"}


def _parse_s3_uri(path: Optional[str]) -> Optional[Tuple[str, str]]:
    """Return ``(bucket, key)`` when *path* is an ``s3://`` URI, else ``None``."""
    if not isinstance(path, str):
        return None
    if len(path) <= 7 or not path.lower().startswith(S3_SCHEME):
        return None
    # Strip only the leading scheme; the key itself may legitimately contain "s3://".
    bucket, key = path[len(S3_SCHEME):].split("/", 1)
    return bucket, key


def _read_s3_object(bucket: str, key: str) -> io.BytesIO:
    """Download an S3 object and return its content as an in-memory stream."""
    response = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    return io.BytesIO(response.get('Body').read())


def _count_image_frames(source) -> int:
    """Return the number of frames (pages) in the image at *source* (path or stream)."""
    with Image.open(source) as img:
        return sum(1 for _ in ImageSequence.Iterator(img))


# ---------------------------------------------------------------------------
# Command line definition
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()

input_doc_or_example_or_stdin = parser.add_mutually_exclusive_group(required=True)
input_doc_or_example_or_stdin.add_argument("--input-document",
                                           help="s3 object (s3://) or file from local filesystem")
input_doc_or_example_or_stdin.add_argument("--input-glob", help="local filesystem glob to go through many files")
input_doc_or_example_or_stdin.add_argument("--example",
                                           dest='use_example',
                                           action='store_true',
                                           help="using the example document to call Textract")
input_doc_or_example_or_stdin.set_defaults(use_example=False)
input_doc_or_example_or_stdin.add_argument("--stdin",
                                           dest='inputfromstdin',
                                           action='store_true',
                                           help="receive JSON from stdin")
input_doc_or_example_or_stdin.set_defaults(inputfromstdin=False)

parser.add_argument(
    "--overlay-document",
    help="point to overlay document when input is --stdin and overlay is requested. "
    "Similar to --input-document could be on s3:// or local file",
    type=str)
parser.add_argument(
    "--features",
    nargs='+',
    type=str,
    choices=["FORMS", "TABLES"],
    help="features to call Textract with. Will trigger call to AnalyzeDocument instead of DetectDocumentText")
parser.add_argument("--pretty-print", nargs='+', type=str, choices=["WORDS", "LINES", "FORMS", "TABLES"], help="")
parser.add_argument("--pretty-print-table-format",
                    type=str,
                    choices=[
                        "csv", "plain", "simple", "github", "grid", "fancy_grid", "pipe", "orgtbl", "jira", "presto",
                        "pretty", "psql", "rst", "mediawiki", "moinmoin", "youtrack", "html", "unsafehtml", "latex",
                        "latex_raw", "latex_booktabs", "latex_longtable", "textile", "tsv"
                    ],
                    default='github',
                    help="which format to output the pretty print information to. Only effects FORMS and TABLES")
parser.add_argument("--overlay",
                    nargs='+',
                    type=str,
                    choices=["WORD", "LINE", "FORM", "KEY", "VALUE", "TABLE", "CELL"],
                    help="defines what bounding boxes to draw on the output")
parser.add_argument("--pop-up-overlay-output",
                    dest='showoutput',
                    action='store_true',
                    help="shows image with overlay")
parser.set_defaults(showoutput=False)
parser.add_argument("--use-polygon",
                    dest='usepolygon',
                    action='store_true',
                    help="uses polygon instead of bounding boxes")
parser.set_defaults(usepolygon=False)
parser.add_argument(
    "--overlay-text",
    dest="showoverlaytext",
    action="store_true",
    help="shows image with WORD or LINE text overlay. "
    "When both WORD and LINE overlay are specified, WORD text will be overlayed")
parser.set_defaults(showoverlaytext=False)
parser.add_argument(
    "--overlay-confidence",
    dest='showconfidence',
    action='store_true',
    # TODO: Default to showing WORD confidence if --overlay-text not specified?
    help="shows image with confidence overlay. Only supported for WORD or LINE")
parser.set_defaults(showconfidence=False)
parser.add_argument("--overlay-output-folder",
                    type=str,
                    default=None,
                    help="output with bounding boxes to folder")
parser.add_argument("--version",
                    action='version',
                    version=f'%(prog)s {__version__}',
                    help="print version information")
parser.add_argument("--no-stdout", dest='showstdout', action='store_false', help="no output to stdout")
parser.set_defaults(showstdout=True)

show_logs = parser.add_mutually_exclusive_group(required=False)
show_logs.add_argument("-v", dest='showinfo', action='store_true', help=">=INFO level logging output to stderr")
show_logs.set_defaults(showinfo=False)
show_logs.add_argument("-vv", dest='showdebug', action='store_true', help=">=DEBUG level logging output to stderr")
show_logs.set_defaults(showdebug=False)

args = parser.parse_args()

input_document: Optional[str] = args.input_document
overlay_document: Optional[str] = args.overlay_document
input_glob: Optional[str] = args.input_glob
features_arg = args.features
pretty_print_arg = args.pretty_print
overlay_output_folder = args.overlay_output_folder
overlay_arg = args.overlay
showoutput = args.showoutput
usepolygon = args.usepolygon
showstdout = args.showstdout
use_example = args.use_example
showdebug = args.showdebug
showinfo = args.showinfo
use_stdin = args.inputfromstdin
show_overlay_text = args.showoverlaytext
show_overlay_confidence = args.showconfidence
pretty_print_table_format_arg = args.pretty_print_table_format

# ---------------------------------------------------------------------------
# Logging: one stderr handler shared by this script and the libraries it uses
# ---------------------------------------------------------------------------
if showinfo or showdebug:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    handler.setLevel(logging.INFO)

    library_loggers = [logging.getLogger('textractcaller'), logging.getLogger('textractprettyprinter')]
    for library_logger in library_loggers:
        library_logger.propagate = True
    verbose_loggers = [logger] + library_loggers
    for verbose_logger in verbose_loggers:
        verbose_logger.setLevel(logging.INFO)
        verbose_logger.addHandler(handler)
    logger.info("log level: INFO")

    if showdebug:
        handler.setLevel(logging.DEBUG)
        for verbose_logger in verbose_loggers:
            verbose_logger.setLevel(logging.DEBUG)
        logger.debug("current log level: DEBUG")

# ---------------------------------------------------------------------------
# Translate the raw arguments into the enums the libraries expect
# ---------------------------------------------------------------------------
if use_example:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(textracthelper._version.__file__))
    input_document = os.path.join(SCRIPT_DIR, "examples/employmentapp.png")

features = [Textract_Features[x] for x in features_arg] if features_arg else None

pretty_print_table_format = Pretty_Print_Table_Format[pretty_print_table_format_arg]
logger.debug(f"pretty_print_table_format: {pretty_print_table_format}")

pretty_print = [Textract_Pretty_Print[x] for x in pretty_print_arg] if pretty_print_arg else None
overlay = [Textract_Types[x] for x in overlay_arg] if overlay_arg else []

# ---------------------------------------------------------------------------
# Validate argument combinations before spending a Textract call
# ---------------------------------------------------------------------------
overlay_requested = bool(showoutput or overlay_output_folder)
forms_feature_missing = not features_arg or "FORMS" not in features_arg
tables_feature_missing = not features_arg or "TABLES" not in features_arg

errors = []
if input_document and overlay_document:
    errors.append("both --input-document and --overlay-document do not make sense at the moment. "
                  "--overlay-document is only used with --stdin")
# With --stdin the JSON is taken as-is, so the feature checks only apply when Textract is called.
if not use_stdin:
    if pretty_print_arg and "FORMS" in pretty_print_arg and forms_feature_missing:
        errors.append("should pretty-print FORMS but is not requested as --features")
    if pretty_print_arg and "TABLES" in pretty_print_arg and tables_feature_missing:
        errors.append("should pretty-print TABLES but is not requested as --features")
    if overlay_arg and any(x in overlay_arg for x in ("CELL", "TABLE")) and tables_feature_missing:
        errors.append("should overlay TABLE or CELL but is not requested as --features TABLES")
    if overlay_arg and any(x in overlay_arg for x in ("FORM", "KEY", "VALUE")) and forms_feature_missing:
        errors.append("should overlay FORM or KEY or VALUE but FORMS not requested as --features FORMS")
# The overlay step (including the output file name) needs the --overlay list.
if overlay_requested and not overlay_arg:
    errors.append("asked for --pop-up-overlay-output or --overlay-output-folder, but not --overlay arguments given")
# With --stdin there is no input document, so the image to draw on has to be named explicitly.
if use_stdin and overlay_requested and not overlay_document:
    errors.append("overlay requested with --stdin, but no --overlay-document given")

for message in errors:
    print(message)
if errors:
    sys.exit(1)

# ---------------------------------------------------------------------------
# Process every input
# ---------------------------------------------------------------------------
logger.debug("calling Textract")
if use_stdin:
    input_items = [json.load(sys.stdin)]
elif input_glob:
    input_items = glob.glob(input_glob)
else:
    input_items = [input_document]

for input_item in input_items:
    if use_stdin:
        # The item is the already-parsed Textract JSON; the image (if any) is the overlay document.
        doc = input_item
        document_path = overlay_document
    else:
        document_path = input_item

    # Resolve the S3 location from the document path, never from the stdin JSON.
    s3_location = _parse_s3_uri(document_path)
    if s3_location:
        logger.debug(f"s3_bucket: {s3_location[0]}, s3_key: {s3_location[1]}")

    if not use_stdin:
        logger.debug(f"input_document: '{document_path}'")
        # check if TIFF multi-page, if so force async
        force_async = False
        if is_tiff(document_path):
            tiff_source = _read_s3_object(*s3_location) if s3_location else document_path
            if _count_image_frames(tiff_source) > 1:
                force_async = True
                logger.warning("multi-page TIFF, have to do async")
        if force_async:
            doc = call_textract(input_document=document_path, features=features, force_async_api=True)
        else:
            doc = call_textract(input_document=document_path, features=features)
        logger.debug("receivedTextract response")

    if showstdout:
        if pretty_print:
            print(get_string(textract_json=doc, output_type=pretty_print, table_format=pretty_print_table_format))
        else:
            print(json.dumps(doc))

    logger.debug(f"overlay_output_folder or showoutput: {overlay_output_folder or showoutput}")
    logger.debug(f"usepolygon: {usepolygon}")
    if not overlay_requested:
        continue

    logger.debug("overlay")
    image_source = _read_s3_object(*s3_location) if s3_location else document_path
    with Image.open(image_source) as image:
        # convert() returns an independent copy, so the source image can be closed afterwards.
        rgb_im = image.convert('RGB')
        document_dimension: DocumentDimensions = DocumentDimensions(doc_width=image.size[0],
                                                                    doc_height=image.size[1])
    draw = ImageDraw.Draw(rgb_im)

    if usepolygon:
        # Polygon mode outlines LINE blocks only.
        t_doc: t2.TDocument = t2.TDocumentSchema().loads(json.dumps(doc))  # type: ignore
        for page in t_doc.pages:
            line_blocks = t_doc.get_blocks_by_type(block_type_enum=t2.TextractBlockTypes.LINE, page=page)
            for block in line_blocks:
                corners = []
                for vertex in block.geometry.polygon[:4]:
                    # Scale a fresh TPoint so the parsed document's own geometry is left untouched.
                    point: t2.TPoint = t2.TPoint(x=vertex.x, y=vertex.y)
                    point.scale(doc_width=document_dimension.doc_width, doc_height=document_dimension.doc_height)
                    corners.append((point.x, point.y))
                draw.polygon(xy=corners, width=2)
    else:
        # use WORD text overlay when both LINE and WORD overlays are specified
        suppress_line_text_overlay = show_overlay_text and "WORD" in overlay_arg and "LINE" in overlay_arg
        bounding_box_list = get_bounding_boxes(textract_json=doc,
                                               document_dimensions=[document_dimension],
                                               overlay_features=overlay)
        for bbox in bounding_box_list:
            box_type_name = bbox.box_type.name
            box_color = OVERLAY_COLORS[box_type_name]
            draw.rectangle(xy=(bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax), outline=box_color, width=2)

            wants_text = box_type_name == 'WORD' or (box_type_name == 'LINE' and not suppress_line_text_overlay)
            if not (show_overlay_text and wants_text):
                continue

            logger.debug(f"bbox type {box_type_name}: {bbox.text}, confidence: {bbox.confidence}")
            overlay_text = bbox.text
            if show_overlay_confidence:
                overlay_text += f" {bbox.confidence}%"
            # Size the label relative to the box and place it just above the box.
            box_height = round((bbox.ymax - bbox.ymin) * 0.8)
            try:
                font = ImageFont.truetype('Roboto-Regular.ttf', size=box_height)
            except (OSError, ValueError):
                # Font file not available or unusable size: fall back to Pillow's default font.
                font = None
            draw.text(xy=(bbox.xmin, bbox.ymin - box_height),
                      text=overlay_text,
                      fill=box_color,
                      align='center',
                      font=font)

    if showoutput:
        rgb_im.show()

    if overlay_output_folder:
        os.makedirs(overlay_output_folder, exist_ok=True)
        file_name, suffix = get_filename_from_document(input_document=document_path)
        output_types = "_".join(overlay_arg)
        output_file_name = os.path.join(overlay_output_folder, f"{file_name}_boxed_{output_types}_{suffix}")
        output_format = suffix[1:].upper()
        output_format = SAVE_FORMAT_ALIASES.get(output_format, output_format)
        rgb_im.save(output_file_name, output_format)