# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Lambda function that turns PDFs stored in S3 into text and starts topic detection.

Every PDF found under the ``whitepapers`` prefix is downloaded to ``/tmp``,
converted to plain text with pdfplumber, and uploaded again under the
``whitepapers-text/`` prefix. An Amazon Comprehend topic detection job is then
started over the uploaded text files.
"""
import json
import os
import sys

import boto3
import pdfplumber

comprehend = boto3.client('comprehend')
s3 = boto3.client('s3')

# Zero-based index of the page preferred as the language-detection sample
# (the fifth page of the document).
LANGUAGE_SAMPLE_PAGE_INDEX = 4


def _pick_language_sample(page_texts):
    """Return the page text to use for language detection, or '' if there is none.

    The page at LANGUAGE_SAMPLE_PAGE_INDEX is preferred. When the document is
    shorter than that, or that page has no text, the first page that contains
    any non-whitespace text is used instead.
    """
    if len(page_texts) > LANGUAGE_SAMPLE_PAGE_INDEX:
        preferred = page_texts[LANGUAGE_SAMPLE_PAGE_INDEX]
        if preferred.strip():
            return preferred
    return next((text for text in page_texts if text.strip()), '')


def pdf2text(name_of_the_item):
    """Convert a PDF in /tmp to plain text, upload the text to S3, detect its language.

    Args:
        name_of_the_item: File name (not a path) of a PDF located in ``/tmp``.

    Returns:
        The response of ``comprehend.detect_dominant_language`` for a sample
        page of the document. When the document has no extractable text the
        service is not called and ``{'Languages': []}`` is returned as a
        placeholder shaped like an empty result.
    """
    with pdfplumber.open(r'/tmp/' + name_of_the_item) as pdf:
        # Guard against extract_text() giving None for a page without text,
        # which would break the concatenation below.
        page_texts = [pdf_page.extract_text() or '' for pdf_page in pdf.pages]

    # Each page is preceded by a newline, so the result starts with '\n'
    # exactly like the previous incremental concatenation did.
    all_text = ''.join('\n' + page_text for page_text in page_texts)

    sample_text = _pick_language_sample(page_texts)
    if sample_text:
        dominant_language = comprehend.detect_dominant_language(
            Text=sample_text
        )
    else:
        dominant_language = {'Languages': []}

    # Strip only the final extension so names such as 'a.b.pdf' map to
    # 'a.b.txt' instead of colliding on 'a.txt'.
    base_name = name_of_the_item.rsplit('.', 1)[0]
    s3.put_object(
        Body=all_text,
        Bucket='xxxxxxxxxxxxx',  # Please replace with your bucket name
        Key='whitepapers-text/' + base_name + '.txt'
    )
    return dominant_language


def lambda_handler(event, context):
    """Convert every PDF under the 'whitepapers' prefix and start topic detection.

    Args:
        event: Lambda invocation event; not read by this handler.
        context: Lambda context object; not read by this handler.

    Returns:
        A dict with the ID of the started topic detection job
        (``TopicDetectionID``) and the list of language-detection results,
        one per processed PDF (``DocumentsLanguages``).
    """
    documents_languages = []

    # Paginate so that listings spanning several result pages are processed
    # completely; a page without 'Contents' (no matching objects) is skipped.
    paginator = s3.get_paginator('list_objects_v2')
    listing_pages = paginator.paginate(
        Bucket='axxxxxxxxxxxxxxxxxx',  # Please replace with your bucket name
        Prefix='whitepapers'
    )
    for listing_page in listing_pages:
        for s3_object in listing_page.get('Contents', []):
            key = s3_object['Key']
            # Only keys ending in '.pdf' are documents. This also skips a
            # folder placeholder key, so the first entry no longer has to be
            # dropped blindly.
            if not key.endswith('.pdf'):
                continue

            # Use the last path component so nested keys resolve to the file name.
            file_name = key.rsplit('/', 1)[-1]
            local_path = '/tmp/' + file_name
            try:
                with open(local_path, 'wb') as data:
                    s3.download_fileobj('xxxxxxxxxxxxxxxxxxxxxx', key, data)  # Please replace with your bucket name
                # The file is closed (and therefore flushed) before it is parsed.
                documents_languages.append(pdf2text(file_name))
            finally:
                # Remove the temporary copy so downloads do not accumulate in /tmp.
                if os.path.exists(local_path):
                    os.remove(local_path)

    # NOTE(review): the output S3Uri uses the same prefix as the input; consider
    # a separate output prefix so job results are not mixed with input documents
    # on later runs - confirm against the intended bucket layout.
    topic_detection = comprehend.start_topics_detection_job(
        InputDataConfig={
            'S3Uri': 's3://xxxxxxxxxxxxxxxxxxxxxx/whitepapers-text/',  # Replace for your S3 URI
            'InputFormat': 'ONE_DOC_PER_FILE',
        },
        OutputDataConfig={
            'S3Uri': 's3://xxxxxxxxxxxxxxxxxx/whitepapers-text/'  # Replace for your S3 URI
        },
        DataAccessRoleArn='arn:aws:iam::xxxxxxxxxx:role/comprehend-role',  # Please replace with your role's arn
        JobName='aprendiendoaws-comprehend-topics-detection',
        NumberOfTopics=10
    )

    return {
        'TopicDetectionID': topic_detection['JobId'],
        'DocumentsLanguages': documents_languages
    }