import base64
import json
import zipfile
from datetime import datetime
from io import BytesIO

import boto3
from pdf2image import convert_from_bytes
from PyPDF2 import PdfFileReader, PdfFileWriter
from textractcaller.t_call import call_textract
from textractprettyprinter.t_pretty_print import get_lines_string


def call_textract_on_image(textract, image, i):
    """Send one page image to Textract and return the JSON response.

    Args:
        textract: boto3 Textract client passed through to ``call_textract``.
        image: image object for a single page; it must support
            ``save(buffer, format='JPEG')``.
        i: page index, used only in the progress message.

    Returns:
        Whatever ``call_textract`` returns for the JPEG-encoded page bytes.
    """
    print(f"Inputting image {i} into Textract")
    # The page is encoded in memory; nothing is written to disk or S3.
    buf = BytesIO()
    image.save(buf, format='JPEG')
    byte_string = buf.getvalue()
    return call_textract(input_document=byte_string, boto3_textract_client=textract)


def call_comprehend(text, comprehend, endpoint_arn):
    """Classify raw document text with a Comprehend custom classifier.

    Args:
        text: raw text of one document page.
        comprehend: boto3 Comprehend client.
        endpoint_arn: ARN of the custom classification model endpoint.

    Returns:
        The ``classify_document`` JSON response containing the predicted
        class(es).
    """
    print("Inputting text into Comprehend model")
    return comprehend.classify_document(
        Text=text,
        EndpointArn=endpoint_arn
    )


def add_page_to_class(i, _class, pages_by_class):
    """Record that page ``i`` belongs to ``_class``.

    ``pages_by_class`` is mutated in place. Page numbers are appended in call
    order, so when pages are processed in order each class's list stays
    ordered and the per-class output PDF keeps the original page order.

    Args:
        i: page number (integer) within the input PDF.
        _class: class name predicted for that page.
        pages_by_class: dict mapping class name -> list of page numbers.
    """
    pages_by_class.setdefault(_class, []).append(i)
    print(f"Added page {i} to {_class}\n")


def create_output_pdfs(input_pdf_content, pages_by_class):
    """Build an in-memory ZIP holding one PDF per predicted class.

    Args:
        input_pdf_content: bytes of the original multi-page PDF.
        pages_by_class: dict mapping class name -> list of page numbers of the
            input PDF (as accepted by ``PdfFileReader.getPage``).

    Returns:
        A ``BytesIO`` containing the ZIP archive, with one ``<class>.pdf``
        entry per key of ``pages_by_class``.
    """
    input_pdf_buffer = BytesIO(input_pdf_content)
    input_pdf = PdfFileReader(input_pdf_buffer, strict=False)
    output_zip_buffer = BytesIO()

    with zipfile.ZipFile(output_zip_buffer, "w") as zip_archive:
        for _class, page_numbers in pages_by_class.items():
            output = PdfFileWriter()
            for page_num in page_numbers:
                output.addPage(input_pdf.getPage(page_num))

            output_buffer = BytesIO()
            output.write(output_buffer)
            zip_archive.writestr(f"{_class}.pdf", output_buffer.getvalue())
            print(f"Created PDF for {_class}")

    return output_zip_buffer


def _predicted_class_name(comprehend_response):
    """Return the name of the highest-scoring class in a Comprehend response."""
    classes = comprehend_response['Classes']
    if not classes:
        raise ValueError("Comprehend response contains no classes")
    # Select by score instead of trusting list order. max() returns the first
    # maximal element, so an already-sorted (or score-less) list still yields
    # the first entry.
    return max(classes, key=lambda entry: entry.get('Score', 0))['Name']


def split_input_pdf_by_class(input_pdf_content, endpoint_arn, _id):
    """Classify every page of a PDF and group the page numbers by class.

    Each page is rendered to an image, the image bytes are sent directly to
    Textract, the extracted lines are joined into raw text, and that text is
    classified through the Comprehend endpoint.

    Args:
        input_pdf_content: bytes of the multi-page input PDF.
        endpoint_arn: ARN of the Comprehend custom classification endpoint.
        _id: run identifier; currently unused, kept so existing callers
            continue to work.

    Returns:
        Dict mapping class name -> list of zero-based page numbers, in page
        order.
    """
    textract = boto3.client('textract')
    comprehend = boto3.client('comprehend')
    pages_by_class = {}

    # One image per PDF page, in page order.
    images = convert_from_bytes(input_pdf_content)

    for i, image in enumerate(images):
        textract_response = call_textract_on_image(textract, image, i)
        raw_text = get_lines_string(textract_json=textract_response)
        comprehend_response = call_comprehend(raw_text, comprehend, endpoint_arn)
        _class = _predicted_class_name(comprehend_response)
        add_page_to_class(i, _class, pages_by_class)

    print("Input PDF has been split up and classified\n")
    return pages_by_class


def _build_response(status_code, status_description, body):
    """Assemble a handler response in the shape shared by every reply."""
    return {
        "statusCode": status_code,
        "statusDescription": status_description,
        "isBase64Encoded": False,
        "headers": {
            "Content-Type": "text/html"
        },
        "body": body
    }


def _error_response(status_code, status_description, message):
    """Build an error reply whose body is a JSON object with an 'error' key."""
    return _build_response(
        status_code, status_description, json.dumps({'error': message})
    )


def lambda_handler(event, context):
    """Lambda entry point for the Document Splitter API.

    Routes on ``event['path']``:

    * ``/``            - returns a static description string.
    * ``/s3_file``     - reads the input PDF from S3; the object key is the
      part of the ``input_pdf_uri`` query parameter after ``<bucket_name>/``.
    * ``/local_file``  - reads the input PDF from the base64-encoded
      ``event['body']``, keeping everything from the first ``%PDF`` that
      follows a blank line (CRLF CRLF).

    Both processing routes require the ``endpoint_arn`` and ``bucket_name``
    query parameters. The per-class PDFs are zipped and uploaded to
    ``bucket_name``; the reply body is a JSON object holding
    ``output_zip_file_s3_uri``.

    Unknown paths return 404 and missing or malformed inputs return 400,
    instead of raising an unhandled exception.

    Args:
        event: request event dict; ``path``, ``queryStringParameters`` and
            ``body`` are read.
        context: Lambda context object (unused).

    Returns:
        Response dict with ``statusCode``, ``statusDescription``,
        ``isBase64Encoded``, ``headers`` and ``body``.
    """
    path = event['path']

    if path == '/':
        return _build_response(200, "200 OK", "This is the Document Splitter API.")

    if path not in ('/s3_file', '/local_file'):
        return _error_response(404, "404 Not Found", f"Unsupported path: {path}")

    # queryStringParameters may be absent or None when the request carries no
    # query string.
    request_body = event.get('queryStringParameters') or {}
    missing = [
        name for name in ('endpoint_arn', 'bucket_name')
        if not request_body.get(name)
    ]
    if missing:
        return _error_response(
            400, "400 Bad Request",
            f"Missing query parameter(s): {', '.join(missing)}"
        )

    endpoint_arn = request_body['endpoint_arn']
    bucket_name = request_body['bucket_name']
    _id = datetime.now().strftime("%Y%m%d%H%M%S")
    s3 = boto3.client('s3')

    if path == '/s3_file':
        input_pdf_uri = request_body.get('input_pdf_uri') or ''
        # The object key is everything after the first "<bucket_name>/".
        _, separator, input_pdf_key = input_pdf_uri.partition(bucket_name + "/")
        if not separator or not input_pdf_key:
            return _error_response(
                400, "400 Bad Request",
                "input_pdf_uri must contain '<bucket_name>/<key>'"
            )
        s3_response_object = s3.get_object(Bucket=bucket_name, Key=input_pdf_key)
        input_pdf_content = s3_response_object['Body'].read()
    else:  # '/local_file'
        try:
            decoded_data = base64.standard_b64decode(event.get('body') or b"")
        except (TypeError, ValueError):
            # binascii.Error (invalid base64) is a ValueError subclass.
            return _error_response(
                400, "400 Bad Request", "Request body is not valid base64"
            )
        # Drop everything before the PDF payload (the first "%PDF" that
        # follows a blank line).
        _, marker, pdf_tail = decoded_data.partition(b"\r\n\r\n%PDF")
        if not marker:
            return _error_response(
                400, "400 Bad Request", "Request body does not contain a PDF"
            )
        input_pdf_content = b"%PDF" + pdf_tail

    # pages_by_class is a dictionary
    # key is class name; value is list of page numbers belonging to the key class
    pages_by_class = split_input_pdf_by_class(input_pdf_content, endpoint_arn, _id)

    output_zip_buffer = create_output_pdfs(input_pdf_content, pages_by_class)

    output_key_name = f"workflow2_output_documents_{_id}.zip"
    s3.put_object(Body=output_zip_buffer.getvalue(), Bucket=bucket_name,
                  Key=output_key_name, ContentType='application/zip')

    output_zip_file_s3_uri = f"s3://{bucket_name}/{output_key_name}"
    return _build_response(
        200, "200 OK",
        json.dumps(
            {
                'output_zip_file_s3_uri': output_zip_file_s3_uri
            }
        )
    )