""" Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: MIT-0. """ from elasticsearch import Elasticsearch, RequestsHttpConnection import requests from aws_requests_auth.aws_auth import AWSRequestsAuth from requests_aws4auth import AWS4Auth import base64 from s3transfer.manager import TransferManager import os import os.path import sys import boto3 import json import io from io import BytesIO import sys try: from urllib.parse import unquote_plus except ImportError: from urllib import unquote_plus print('setting up boto3') root = os.environ["LAMBDA_TASK_ROOT"] sys.path.insert(0, root) print(boto3.__version__) print('core path setup') s3 = boto3.resource('s3') s3client = boto3.client('s3') print('initializing comprehend') comprehend = boto3.client(service_name='comprehend', region_name='us-east-1') print('done') host= os.environ['esDomain'] print("ES DOMAIN IS..........") region = 'us-east-1' # e.g. us-west-1 service = 'es' credentials = boto3.Session().get_credentials() def connectES(): print ('Connecting to the ES Endpoint {0}') awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) try: es = Elasticsearch( hosts=[{'host': host, 'port': 443}], http_auth = awsauth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection) return es except Exception as E: print("Unable to connect to {0}") print(E) exit(3) print("sucess seting up es") print("setting up Textract") # get the results textract = boto3.client( service_name='textract', region_name= 'us-east-1', endpoint_url='https://textract.us-east-1.amazonaws.com', ) print("Textract Set UP") # --------------- Main Lambda Handler ------------------ def handler(event, context): print("Received event: " + json.dumps(event, indent=2)) # Get the object from the event and show its content type bucket = event['Records'][0]['s3']['bucket']['name'] key = unquote_plus(event['Records'][0]['s3']['object']['key']) print("key is"+key) print("bucket is"+bucket) text="" textvalues=[] textvalues_entity={} try: s3.Bucket(bucket).download_file(Key=key,Filename='/tmp/{}') # Read document content with open('/tmp/{}', 'rb') as document: imageBytes = bytearray(document.read()) print("Object downloaded") #Analyze the text using TEXTRACT #textract = AwsHelper().getClient('textract') response = textract.analyze_document(Document={'Bytes': imageBytes},FeatureTypes=["TABLES", "FORMS"]) blocks=response['Blocks'] for block in blocks: if block['BlockType'] == 'LINE': text += block['Text']+"\n" print(text) # Extracting Key Phrases sentiment_response = comprehend.detect_key_phrases(Text=text, LanguageCode='en') KeyPhraseList=sentiment_response.get("KeyPhrases") for s in KeyPhraseList: textvalues.append(s.get("Text")) detect_entity= comprehend.detect_entities(Text=text, LanguageCode='en') EntityList=detect_entity.get("Entities") for s in EntityList: textvalues_entity.update([(s.get("Type").strip('\t\n\r'),s.get("Text").strip('\t\n\r'))]) s3url= 'https://s3.console.aws.amazon.com/s3/object/'+bucket+'/'+key+'?region=us-east-1' searchdata={'s3link':s3url,'KeyPhrases':textvalues,'Entity':textvalues_entity,'text':text} print(searchdata) print("connecting to ES") es=connectES() #es.index(index="resume-search", doc_type="_doc", body=searchdata) es.index(index="document", doc_type="_doc", body=searchdata) print("data uploaded to Elasticsearch") return 'keyphrases Successfully Uploaded' except Exception as e: print(e) print('Error: ') raise e