###
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: MIT-0
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy of this
 # software and associated documentation files (the "Software"), to deal in the Software
 # without restriction, including without limitation the rights to use, copy, modify,
 # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
 # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 # Copyright Amazon.com, Inc. and its affiliates. All Rights Reserved.
#   SPDX-License-Identifier: MIT
######
import pandas as pd
import boto3
import re
import os
from awsUtils import getExtractedDataFromS3


# Matches a line end made of optional capital letters followed by ".", "!" or
# "?". MULTILINE makes "$" apply at the end of every line, not only at the end
# of the whole string. Despite the name, a match means the line ends like a
# finished sentence; regexHeaderOrFooter treats lines WITHOUT a match as
# header/footer candidates.
patternIncompleteLine = re.compile(r"([A-Z]*[\.!?]$)", re.MULTILINE)


class PatternDetector:
    """Find text lines that repeat across pages (likely headers and footers).

    The first and last few ``Text`` values of every page are counted across
    the document; texts whose count reaches the repeat threshold are reported.
    """

    # Leading rows per page (after skipping marked rows) counted as header
    # candidates.
    HEADER_ROWS = 4
    # Trailing rows per page counted as footer candidates.
    FOOTER_ROWS = 3
    # A text seen more often than this is reported whatever the page count.
    MIN_REPEATS = 10
    # Once a page number above this is reached the scan may stop early.
    EARLY_EXIT_PAGE = 20

    def _isRepeated(self, count, totalPages):
        """Return True when ``count`` is high enough to report the text."""
        return count > self.MIN_REPEATS or count >= totalPages

    def parseHeaderFooter(self, dataFiltered, occurrence):
        """Add the header and footer candidate lines of one frame to the counts.

        The optional ``SKIP_PAGES`` environment variable holds comma-separated
        markers. A header candidate containing any marker is ignored and does
        not use up one of the ``HEADER_ROWS`` slots. Footer rows are not
        filtered by these markers.

        Args:
            dataFiltered: DataFrame whose ``Text`` column is read in row
                order (identifyHeaderFooterPattern passes one page at a time).
            occurrence: dict mapping text to the number of times it has been
                seen so far; updated in place.

        Returns:
            The same ``occurrence`` dict.
        """
        # Empty markers must be dropped: "" is a substring of every string, so
        # a blank SKIP_PAGES (or a trailing comma) would otherwise skip every
        # header candidate. An unset variable simply means "skip nothing".
        skipMarkers = [
            marker
            for marker in os.environ.get("SKIP_PAGES", "").split(",")
            if marker
        ]

        counted = 0
        for value in dataFiltered["Text"]:
            if counted >= self.HEADER_ROWS:
                break
            txt = str(value)
            if any(marker in txt for marker in skipMarkers):
                continue
            occurrence[txt] = occurrence.get(txt, 0) + 1
            counted += 1

        # NOTE(review): on frames shorter than HEADER_ROWS + FOOTER_ROWS the two
        # windows overlap and a row is counted twice; kept as-is because the
        # repeat thresholds were presumably tuned with this behaviour.
        for value in dataFiltered["Text"].tail(self.FOOTER_ROWS):
            txt = str(value)
            occurrence[txt] = occurrence.get(txt, 0) + 1

        return occurrence

    def regexHeaderOrFooter(self, line):
        """Tell whether ``line`` looks like a header/footer rather than prose.

        A line qualifies when it does not end like a sentence (no match of
        ``patternIncompleteLine``) and it starts with a digit, is all upper
        case, or has fewer than two space-separated words.

        Args:
            line: the text to classify; may be empty.

        Returns:
            True for a header/footer-looking line, False otherwise.
        """
        if patternIncompleteLine.search(line):
            return False
        wordCount = len(line.split(" "))
        # line[:1] instead of line[0] so an empty string does not raise.
        return line[:1].isdigit() or line.isupper() or wordCount < 2

    def identifyHeaderFooterPattern(self, bucketName, prefixPath, totalPages):
        """Scan the extracted document and return its repeated header/footer texts.

        Frames are fetched with ``getExtractedDataFromS3(bucketName,
        f"{prefixPath}/{n}")`` for n = 1, 2, ... and processed page by page
        (ascending ``Page`` value). Scanning stops when either

        * any exception is raised while fetching or processing (printed, not
          re-raised; presumably this is how the end of the data is detected,
          TODO confirm against getExtractedDataFromS3), or
        * a page number >= ``totalPages`` or > ``EARLY_EXIT_PAGE`` has been
          processed and at least one text already meets the repeat threshold;
          the remaining pages of the current frame are still counted.

        Args:
            bucketName: passed through to getExtractedDataFromS3.
            prefixPath: key prefix; the running frame number is appended.
            totalPages: page count of the document, used in the threshold.

        Returns:
            dict mapping each text seen more than ``MIN_REPEATS`` times, or at
            least ``totalPages`` times, to its count. The entries are also
            printed.
        """
        occurrence = {}
        chunk = 1

        # NOTE(review): if the fetch helper keeps returning frames without ever
        # raising and no text reaches the threshold, this loop does not end.
        try:
            keepScanning = True
            while keepScanning:
                dataFiltered = getExtractedDataFromS3(
                    bucketName, f"{prefixPath}/{chunk}"
                )
                # groupby yields pages in ascending order and keeps the row
                # order inside each page.
                for page, pageData in dataFiltered.groupby("Page", sort=True):
                    occurrence = self.parseHeaderFooter(pageData, occurrence)

                    reachedCheckpoint = (
                        page >= totalPages or page > self.EARLY_EXIT_PAGE
                    )
                    if keepScanning and reachedCheckpoint:
                        keepScanning = not any(
                            self._isRepeated(count, totalPages)
                            for count in occurrence.values()
                        )
                if keepScanning:
                    chunk += 1
        except Exception as e:
            # Deliberate best-effort: whatever was counted so far is reported.
            print("end of processing :", chunk)
            print(e)

        filterdHeaderFooter = {
            text: count
            for text, count in occurrence.items()
            if self._isRepeated(count, totalPages)
        }

        for text, count in filterdHeaderFooter.items():
            print(text, ":", count)
        return filterdHeaderFooter