######################################################################################################################
#  Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the License). You may not use this file except in compliance
#  with the License. A copy of the License is located at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
#  OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
#  and limitations under the License.
######################################################################################################################

import csv
import json
import os
import uuid

import boto3

import datastore
from helper import AwsHelper, DynamoDBHelper, S3Helper


# Document / page artifact access

def _getDocument(request):
    """Look up the document record referenced by ``request``.

    Reads ``documentsTable``, ``outputTable`` and ``documentId`` from
    ``request`` and returns whatever ``DocumentStore.getDocument`` yields.
    """
    store = datastore.DocumentStore(request["documentsTable"], request["outputTable"])
    return store.getDocument(request["documentId"])


def _isSucceeded(doc):
    """Return True when ``doc`` is truthy and its status is ``SUCCEEDED``."""
    return bool(doc) and doc["documentStatus"] == "SUCCEEDED"


def _pageObjectKey(doc, page, suffix):
    """Build the S3 key of a per-page analysis artifact ending in ``suffix``."""
    return "{}-analysis/{}/page-{}-{}".format(doc["objectName"], doc["documentId"], page, suffix)


def getPageResponse(request):
    """Return the document record with the page's JSON response attached.

    When the document exists and succeeded, the ``page-<n>-response.json``
    object is read from S3, parsed, and stored under ``textractResponse``.
    Returns ``{}`` when no document is found.
    """
    page = request["page"]
    doc = _getDocument(request)

    if _isSucceeded(doc):
        key = _pageObjectKey(doc, page, "response.json")
        doc["textractResponse"] = json.loads(S3Helper.readFromS3(doc["bucketName"], key))

    return doc if doc else {}


def parsePairs(file):
    """Parse a key/value CSV into a list of ``{"key", "value"}`` dicts.

    ``file`` may be a string (split into lines) or any iterable of lines.
    The first row is treated as a header and skipped; missing columns are
    filled with ``''``.
    """
    lines = file.splitlines() if isinstance(file, str) else file
    reader = csv.reader(lines, delimiter=',')
    next(reader, None)  # drop the header row

    return [
        {
            "key": row[0] if len(row) > 0 else '',
            "value": row[1] if len(row) > 1 else '',
        }
        for row in reader
    ]


def getPageForm(request):
    """Return the document record with the page's form pairs attached.

    When the document exists and succeeded, the ``page-<n>-forms.csv``
    object is read from S3 and its parsed pairs are stored under
    ``textractResponse``. Returns ``{}`` when no document is found.
    """
    page = request["page"]
    doc = _getDocument(request)

    if _isSucceeded(doc):
        key = _pageObjectKey(doc, page, "forms.csv")
        doc["textractResponse"] = parsePairs(S3Helper.readFromS3(doc["bucketName"], key))

    return doc if doc else {}


def getTableFromPath(path):
    """Read a CSV file located at ``path`` relative to the working directory.

    Returns the rows as a list so the file handle can be closed before
    returning (a lazy ``csv.reader`` would otherwise outlive the handle).
    """
    fullPath = os.path.join(os.getcwd(), path)
    # newline="" is what the csv module expects; the legacy "rU" mode is
    # rejected by current Python versions.
    with open(fullPath, newline="") as handle:
        return list(csv.reader(handle, delimiter=','))


def getTableFromString(fileStr):
    """Return a CSV reader over the lines of ``fileStr``."""
    return csv.reader(fileStr.splitlines(), delimiter=',')


def _detectEntities(comprehend, text):
    """Run entity detection on ``text``, skipping the call for empty text.

    Empty text is not sent to the service; ``[]`` is returned instead, which
    ``processLine`` treats as "no entities".
    """
    if text == "":
        return []
    return comprehend["entities"](text)


def parseTables(table):
    """Convert table CSV rows into lists of cell descriptors.

    ``table`` is an iterable of rows. A row consisting of the single value
    ``'Table'`` starts a new table; every other non-empty row contributes one
    cell descriptor per column to the current table. Returns a list of
    tables, each a flat list of cells.
    """
    comprehend = getComprehend()
    tables = []
    rowIndex = 1

    for row in table:
        if len(row) == 1 and row[0] == 'Table':
            tables.append([])
            rowIndex = 1
            continue

        if not row:
            # Blank CSV lines carry no cells and must not consume a row index.
            continue

        if not tables:
            # Data rows seen before any 'Table' marker: open an implicit
            # table instead of failing on an empty list.
            tables.append([])

        for columnIndex, cell in enumerate(row, start=1):
            processedLine = processLine({
                "text": cell,
                "comprehend": _detectEntities(comprehend, cell)
            })
            tables[-1].append({
                "RowIndex": rowIndex,
                "ColumnIndex": columnIndex,
                "RowSpan": 1,
                "ColumnSpan": 1,
                "content": cell,
                "tokens": processedLine["phrases"]
            })

        rowIndex += 1

    return tables


def getPageTable(request):
    """Return ``{"tables": [...]}`` for the requested page.

    The ``page-<n>-tables.csv`` object is read and parsed only when the
    document exists and succeeded; otherwise the table list is empty.
    """
    page = request["page"]
    doc = _getDocument(request)

    tables = []
    if _isSucceeded(doc):
        key = _pageObjectKey(doc, page, "tables.csv")
        file = S3Helper.readFromS3(doc["bucketName"], key)
        tables = parseTables(getTableFromString(file))

    return {"tables": tables if tables else []}


# Comprehend

def getComprehend(languageCode="en"):
    """Create a Comprehend client and return its operations as a dict.

    The returned dict has one entry, ``"entities"``, a callable that takes a
    text string and returns the ``detect_entities`` response for
    ``languageCode``.
    """
    region = boto3.Session().region_name
    client = AwsHelper().getClient('comprehend', region)

    def entities(text):
        return client.detect_entities(Text=text, LanguageCode=languageCode)

    return {"entities": entities}


def parseKey(obj, key, default):
    """Return ``obj[key]`` when ``key`` is in ``obj``, else ``default``."""
    if key in obj:
        return obj[key]
    return default


def parsePhrase(start, end, textString, entity="null"):
    """Return the ``textString[start:end]`` slice tagged with ``entity``."""
    return {
        "text": textString[start:end],
        "entity": entity
    }


def _countEntity(entityList, entityType):
    """Increment the counter for ``entityType``, creating it when absent."""
    entityList[entityType] = entityList.get(entityType, 0) + 1


def processLine(line, entityList=None):
    """Split a line of text into entity and non-entity phrases.

    ``line`` is a dict with ``"text"`` and, optionally, ``"comprehend"``
    holding a response with an ``"Entities"`` list. Text between entities is
    emitted as phrases tagged ``"null"``. ``entityList`` is a running count
    per entity type; it is updated in place and returned. When omitted, a
    fresh counter is created for this call.

    Returns ``{"phrases": [...], "entityList": {...}}``.
    """
    if entityList is None:
        # A literal dict default would be shared by every call.
        entityList = {"null": 0}

    text = line["text"]
    comprehend = line["comprehend"] if "comprehend" in line else {}
    entities = comprehend["Entities"] if "Entities" in comprehend else []

    phrases = []
    previous = None

    for current in entities:
        begin = current["BeginOffset"]
        end = current["EndOffset"]

        if previous is None:
            if begin != 0:
                # Leading text before the first entity.
                phrases.append(parsePhrase(0, begin, text))
                _countEntity(entityList, "null")
        else:
            # Text between the previous entity and this one.
            phrases.append(parsePhrase(previous["EndOffset"], begin, text))
            _countEntity(entityList, "null")

        phrases.append(parsePhrase(begin, end, text, current["Type"]))
        _countEntity(entityList, current["Type"])
        previous = current

    if previous is None:
        # No entities at all: the whole text is one untagged phrase.
        phrases.append(parsePhrase(0, len(text), text))
        _countEntity(entityList, "null")
    elif len(text) > previous["EndOffset"]:
        # EndOffset is used as an exclusive slice bound above, so any
        # remaining character (even a single one) is trailing text.
        phrases.append(parsePhrase(previous["EndOffset"], len(text), text))
        _countEntity(entityList, "null")

    return {
        "phrases": phrases,
        "entityList": entityList
    }


def processLines(lines):
    """Process every line and return ``{"lines": [...], "entities": {...}}``.

    ``"lines"`` holds the phrase list of each input line in order;
    ``"entities"`` is the entity-type count accumulated over all lines.
    """
    entityList = {"null": 0}
    processedLines = []

    for item in lines:
        result = processLine(item, entityList)
        entityList = result["entityList"]
        processedLines.append(result["phrases"])

    return {
        "lines": processedLines,
        "entities": entityList
    }


def processPairs(pairs):
    """Process key/value pairs and return ``{"pairs": [...], "keys": {...}}``.

    Each output pair carries the key text with its phrases and the value's
    phrases. ``"keys"`` counts how many times each key text occurred.
    """
    entityList = {"null": 0}
    processedPairs = []
    keyList = {}

    for pair in pairs:
        key = processLine(pair["key"], entityList)
        entityList = key["entityList"]
        value = processLine(pair["value"], entityList)
        entityList = value["entityList"]

        keyText = pair["key"]["text"]
        keyList[keyText] = keyList.get(keyText, 0) + 1

        processedPairs.append({
            "key": {
                "text": keyText,
                "tokens": key["phrases"]
            },
            "value": {
                "tokens": value["phrases"]
            }
        })

    return {
        "pairs": processedPairs,
        "keys": keyList
    }


def table(request):
    """Return the table redaction data for the requested page."""
    return getPageTable(request)


def text(request):
    """Return the processed ``LINE`` blocks of the requested page.

    Every block whose ``BlockType`` is ``"LINE"`` is run through entity
    detection and then through ``processLines``.
    """
    comprehend = getComprehend()
    document = getPageResponse(request)
    blocks = parseKey(document, 'textractResponse', [])

    lines = [
        {"text": block["Text"], "comprehend": _detectEntities(comprehend, block["Text"])}
        for block in blocks
        if block["BlockType"] == "LINE"
    ]

    return processLines(lines)


def form(request):
    """Return the processed key/value pairs of the requested page.

    Entity detection is skipped for a pair's key and value when the value is
    empty, and additionally for any individual text that is itself empty.
    """
    comprehend = getComprehend()
    document = getPageForm(request)
    rawPairs = parseKey(document, 'textractResponse', [])

    pairs = []
    for pair in rawPairs:
        keyText = pair["key"]
        valueText = pair["value"]
        # Keys are only analysed when the pair has a value; an empty key is
        # never sent to the service.
        keyEntities = [] if valueText == "" else _detectEntities(comprehend, keyText)
        pairs.append({
            "key": {
                "text": keyText,
                "comprehend": keyEntities
            },
            "value": {
                "text": valueText,
                "comprehend": _detectEntities(comprehend, valueText)
            }
        })

    return processPairs(pairs)


# Redaction

def redact(request):
    """Entry point: build redaction data for a document page.

    Reads ``documentId``, ``page`` (default ``1``) and ``type`` from
    ``request["params"]`` and copies the first two onto ``request``.
    With a ``type`` of ``"table"``, ``"text"`` or ``"form"`` only that
    handler runs; without one, all handlers run and their result dicts are
    merged. A missing ``documentId`` or an unknown ``type`` yields
    ``{"errors": {...}}`` instead of raising.
    """
    params = parseKey(request, "params", {})
    documentId = parseKey(params, "documentId", None)
    page = parseKey(params, "page", 1)
    redactType = parseKey(params, "type", None)

    types = {
        "table": table,
        "text": text,
        "form": form,
    }

    if documentId is None:
        return {"errors": {"documentId": "no documentId provided in query"}}

    if redactType is not None and redactType not in types:
        return {"errors": {"type": "unsupported redaction type: {}".format(redactType)}}

    request["documentId"] = documentId
    request["page"] = page

    if redactType is not None:
        return types[redactType](request)

    responses = {}
    for handler in types.values():
        responses.update(handler(request))
    return responses