import os
import json
import boto3
import sys
from botocore.exceptions import ClientError

#Maximum number of strings in a STRING_LIST type attribute allowed by Kendra; the default is 10 and it can be increased on request
elimit = 10
#Minimum confidence score, as returned by Amazon Comprehend, for a recognized entity to be considered
min_score = 0.97
#Chunk size used to avoid exceeding the Comprehend detect_entities API UTF-8 character limit
#compre_text_size = 3950
compre_text_size = 4000
client = boto3.client(service_name='comprehend')
#List of entity types recognized by Amazon Comprehend
categories = ["COMMERCIAL_ITEM", "DATE", "EVENT", "LOCATION", "ORGANIZATION", "OTHER", "PERSON", "QUANTITY", "TITLE"]

#Read a Wikipedia page saved as a text file and detect entities in it using Amazon Comprehend
def document_handler(fname):
    with open(fname, 'r') as f:
        doc_text = f.read()
    #Recognized entity strings per entity type, used later to populate the Kendra custom attributes
    entity_data = dict()
    #Upper-cased text strings already seen per entity type
    category_text = dict()
    #Frequency of each text string per entity type
    text_frequency = dict()
    for et in categories:
        entity_data[et] = []
        category_text[et] = []
        text_frequency[et] = dict()
    #Call detect_entities in a loop to stay within the text size limit
    #This splitting approach is likely a little too naive for a production application, as it can potentially split entities at chunk boundaries
    for i in range(0, len(doc_text), compre_text_size):
        try:
            entities = client.detect_entities(Text=doc_text[i:i+compre_text_size], LanguageCode='en')
        except Exception as e:
            print("Exiting - detect_entities terminated with exception", e, file=sys.stderr)
            sys.exit(1)
        for e in entities["Entities"]:
            #Of the recognized entities, take only those that have a confidence score higher than min_score,
            #are printable, don't contain quotes, and are previously unseen
            if ((e["Score"] > min_score) and (e["Text"].isprintable()) and (not "\"" in e["Text"]) and (not e["Text"].upper() in category_text[e["Type"]])):
                #Append the text to entity data to be used for a Kendra custom attribute
                entity_data[e["Type"]].append(e["Text"])
                #Keep track of text in upper case so that we don't treat the same text written in different cases differently
                category_text[e["Type"]].append(e["Text"].upper())
                #Keep track of the frequency of the text so that we can take the text with the highest frequency of occurrence
                text_frequency[e["Type"]][e["Text"].upper()] = 1
            elif (e["Text"].upper() in category_text[e["Type"]]):
                #Keep track of the frequency of the text so that we can take the text with the highest frequency of occurrence
                text_frequency[e["Type"]][e["Text"].upper()] += 1
    #The Kendra attribute metadata JSON object to be populated
    attributes = dict()
    metadata = dict()
    for et in categories:
        metadata[et] = []
        #Take at most elimit recognized text strings having the highest frequency of occurrence
        el = [pair[0] for pair in sorted(text_frequency[et].items(), key=lambda item: item[1], reverse=True)][0:elimit]
        for d in entity_data[et]:
            if (d.upper() in el):
                metadata[et].append(d)
    #Use the input filename to determine the Wikipedia page URL
    npfile = fname.split('.')[0]
    npname = npfile.split('/')[len(npfile.split('/')) - 1]
    npreplace = npname.replace("#", "/")
    metadata["_source_uri"] = "https://en.wikipedia.org/wiki/" + npreplace
    attributes["Attributes"] = metadata
    print(json.dumps(attributes, sort_keys=True, indent=4))

##main
##The script takes a filename relative to the local directory as input and writes the metadata JSON to stdout
if (len(sys.argv) != 2):
    print("Usage: ", sys.argv[0], " <document_file_name>\n")
    exit(1)
else:
    document_file = sys.argv[1]
    document_handler(document_file)
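
#Example invocation (illustrative; "wiki_entities.py" is a hypothetical script name, and the JSON below
#only sketches the shape of the attributes object printed above - it is not real output):
#
#    python3 wiki_entities.py AC#DC.txt
#
#    {
#        "Attributes": {
#            "COMMERCIAL_ITEM": [...],
#            ...
#            "PERSON": [...],
#            "_source_uri": "https://en.wikipedia.org/wiki/AC/DC"
#        }
#    }
#
#The "#" to "/" replacement in document_handler means a page such as "AC/DC" is expected to be saved
#as "AC#DC.txt" so that its source URI can be reconstructed from the filename.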