"""Print the text lines of a multi-column document column by column.

The image is sent to Amazon Textract, every detected LINE block is assigned
to a column based on the horizontal overlap of its bounding box, and the
lines are printed grouped by column.
"""

# Document
documentName = "two-column-image.jpg"


def assign_lines_to_columns(blocks):
    """Tag every LINE block with the index of the column it belongs to.

    Columns are discovered on the fly: the first line that matches no known
    column starts a new one, whose extent is that line's bounding box.
    Columns are therefore numbered in order of first appearance in *blocks*.

    Args:
        blocks: Iterable of block dicts as found under ``response["Blocks"]``.
            Blocks whose ``"BlockType"`` is not ``"LINE"`` are ignored; LINE
            blocks must provide ``"Text"`` and
            ``"Geometry"]["BoundingBox"]`` with ``"Left"`` and ``"Width"``.

    Returns:
        A list of ``(column_index, text)`` tuples in the input order.
    """
    columns = []
    tagged_lines = []
    for block in blocks:
        if block["BlockType"] != "LINE":
            continue

        # The line's horizontal extent does not depend on the column being
        # tested, so compute it once per line.
        bounding_box = block["Geometry"]["BoundingBox"]
        bbox_left = bounding_box["Left"]
        bbox_right = bbox_left + bounding_box["Width"]
        bbox_centre = bbox_left + bounding_box["Width"] / 2

        for index, column in enumerate(columns):
            # Midpoint of the column. This must be (left + right) / 2;
            # left + right / 2 lands to the right of the real centre and
            # makes wide lines miss the column seeded by a short line.
            column_centre = (column["left"] + column["right"]) / 2
            line_centre_in_column = column["left"] < bbox_centre < column["right"]
            column_centre_in_line = bbox_left < column_centre < bbox_right
            if line_centre_in_column or column_centre_in_line:
                break
        else:
            # No existing column matched: this line starts a new one.
            index = len(columns)
            columns.append({"left": bbox_left, "right": bbox_right})

        tagged_lines.append((index, block["Text"]))
    return tagged_lines


def lines_in_reading_order(blocks):
    """Return the text of all LINE blocks, grouped column by column.

    Args:
        blocks: Iterable of block dicts (see ``assign_lines_to_columns``).

    Returns:
        A list of line texts: all lines of column 0 first, then column 1,
        and so on.
    """
    # sorted() is stable, so lines keep their original relative order
    # within each column.
    ordered = sorted(assign_lines_to_columns(blocks), key=lambda entry: entry[0])
    return [text for _, text in ordered]


def main():
    """Run Textract on ``documentName`` and print its lines by column."""
    # Imported here so the pure column-grouping helpers above can be
    # imported and tested without the AWS SDK installed.
    import boto3

    # Amazon Textract client
    textract = boto3.client("textract")

    # Call Amazon Textract
    with open(documentName, "rb") as document:
        response = textract.detect_document_text(
            Document={
                "Bytes": document.read(),
            }
        )

    # Detect columns and print lines
    for text in lines_in_reading_order(response["Blocks"]):
        print(text)


if __name__ == "__main__":
    main()