# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import boto3
import logging


class ComprehendProcessor:
    """Detects entities in a blog post's text with Amazon Comprehend."""

    logger = logging.getLogger()

    def __init__(self):
        super().__init__()
        self.comprehend = boto3.client(service_name='comprehend')

    def process(self, post: dict):
        # Split the text into paragraphs to allow for better processing
        paragraphs = post['post'].split('\n\n')

        # Comprehend requires each document to be at most 5000 bytes of
        # UTF-8, so truncate every paragraph to that limit; skip empty
        # paragraphs, which batch_detect_entities rejects
        post_list = [truncateUTF8length(p, 5000) for p in paragraphs if p.strip()]

        # batch_detect_entities accepts at most 25 documents per call,
        # so walk the paragraph list in batches of 25
        results = []
        for i in range(0, len(post_list), 25):
            res = self.comprehend.batch_detect_entities(
                TextList=post_list[i:i + 25],
                LanguageCode='en')
            for r in res['ResultList']:
                if r['Entities']:
                    results.append(r['Entities'])

        # Create the entities key if it does not exist
        if 'entities' not in post:
            post['entities'] = []

        # Append each detected entity individually
        for entity_list in results:
            post['entities'].extend(entity_list)


def truncateUTF8length(unicodeStr, maxsize):
    # Truncate to at most maxsize bytes of UTF-8; errors="ignore" drops
    # any multi-byte character that the byte cut would otherwise split
    return str(unicodeStr.encode("utf-8")[:maxsize], "utf-8", errors="ignore")
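

# --- Hypothetical usage sketch, not part of the original sample ---
# A minimal demonstration of how ComprehendProcessor.process mutates a
# post dict in place. The sample text is an assumption for illustration;
# running this calls the live Amazon Comprehend API and therefore needs
# AWS credentials with permission for comprehend:BatchDetectEntities.
if __name__ == '__main__':
    sample_post = {
        'post': 'Amazon Web Services is based in Seattle.\n\n'
                'Werner Vogels is the CTO of Amazon.'
    }
    processor = ComprehendProcessor()
    processor.process(sample_post)
    # Each entity is a dict of the form
    # {'Text': 'Seattle', 'Type': 'LOCATION', 'Score': 0.99, ...}
    for entity in sample_post['entities']:
        print(entity['Type'], entity['Text'], round(entity['Score'], 3))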