import os
import re
import argparse
import json
import boto3
from bs4 import BeautifulSoup
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

smr_client = boto3.client("sagemaker-runtime")

parameters = {
    "max_length": 2048,
    "temperature": 0.01,
    "num_beams": 1,  # >1 may raise "probability tensor contains either `inf`, `nan` or element < 0"; even remove_invalid_values=True does not fix it
    "do_sample": False,
    "top_p": 0.7,
    "logits_processor": None,
    # "remove_invalid_values": True
}

'''
1. pip install pdfminer.six
'''

def split_pdf_to_snippet(pdf_path):
    loader = PDFMinerPDFasHTMLLoader(pdf_path)
    data = loader.load()[0]
    soup = BeautifulSoup(data.page_content, 'html.parser')
    content = soup.find_all('div')

    cur_fs = None
    cur_text = ''
    snippets = []  # first collect all snippets that have the same font size
    for c in content:
        sp = c.find('span')
        if not sp:
            continue
        st = sp.get('style')
        if not st:
            continue
        fs = re.findall(r'font-size:(\d+)px', st)
        if not fs:
            continue
        fs = int(fs[0])
        if not cur_fs:
            cur_fs = fs
        if fs == cur_fs:
            cur_text += c.text
        else:
            snippets.append((cur_text, cur_fs))
            cur_fs = fs
            cur_text = c.text
    snippets.append((cur_text, cur_fs))

    cur_idx = -1
    semantic_snippets = []
    # Assumption: headings have a higher font size than their respective content
    for s in snippets:
        # if the current snippet's font size > the previous section's heading => it is a new heading
        if not semantic_snippets or s[1] > semantic_snippets[cur_idx].metadata['heading_font']:
            metadata = {'heading': s[0], 'content_font': 0, 'heading_font': s[1]}
            metadata.update(data.metadata)
            semantic_snippets.append(Document(page_content='', metadata=metadata))
            cur_idx += 1
            continue

        # if the current snippet's font size <= the previous section's content => it belongs to the same
        # section (one could also build a tree-like structure for subsections if needed, but that would
        # require more thought and may be data specific)
        if not semantic_snippets[cur_idx].metadata['content_font'] or s[1] <= semantic_snippets[cur_idx].metadata['content_font']:
            semantic_snippets[cur_idx].page_content += s[0]
            semantic_snippets[cur_idx].metadata['content_font'] = max(s[1], semantic_snippets[cur_idx].metadata['content_font'])
            continue

        # if the current snippet's font size > the previous section's content but less than the previous
        # section's heading, then also start a new section (e.g. the title of a pdf will have the highest
        # font size, but we don't want it to subsume all sections)
        metadata = {'heading': s[0], 'content_font': 0, 'heading_font': s[1]}
        metadata.update(data.metadata)
        semantic_snippets.append(Document(page_content='', metadata=metadata))
        cur_idx += 1

    return semantic_snippets
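# A minimal usage sketch for split_pdf_to_snippet; the file name sample.pdf
# below is illustrative, not part of the original script:
#
#   docs = split_pdf_to_snippet('./sample.pdf')
#   for doc in docs[:3]:
#       print(doc.metadata['heading_font'], doc.metadata['heading'])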
def fontsize_mapping(heading_fonts_arr):
    # rank the distinct heading font sizes from largest to smallest,
    # so index 0 corresponds to the outermost heading level
    heading_fonts_set = list(set(heading_fonts_arr))
    heading_fonts_set.sort(reverse=True)
    idxs = range(len(heading_fonts_set))
    font_idx_mapping = dict(zip(heading_fonts_set, idxs))
    return font_idx_mapping

def split_pdf(pdf_path):
    semantic_snippets = split_pdf_to_snippet(pdf_path)
    heading_fonts_arr = [item.metadata['heading_font'] for item in semantic_snippets]
    heading_arr = [item.metadata['heading'] for item in semantic_snippets]
    fontsize_dict = fontsize_mapping(heading_fonts_arr)

    for idx, snippet in enumerate(semantic_snippets):
        font_size = heading_fonts_arr[idx]
        heading_stack = []
        heading_info = {"font_size": heading_fonts_arr[idx], "heading": heading_arr[idx], "fontsize_idx": fontsize_dict[font_size]}
        heading_stack.append(heading_info)
        # walk backwards to collect the chain of enclosing headings (strictly larger font sizes)
        for j in reversed(range(idx)):
            if font_size < heading_fonts_arr[j]:
                font_size = heading_fonts_arr[j]
                heading_info = {"font_size": font_size, "heading": heading_arr[j], "fontsize_idx": fontsize_dict[font_size]}
                heading_stack.append(heading_info)

        snippet_info = {
            "heading": heading_stack,
            "content": snippet.page_content
        }
        yield snippet_info

def summarize(content, chunk_size=512, llm_endpoint=""):
    summary = content
    if llm_endpoint and len(content) > chunk_size:
        # call the LLM endpoint to summarize
        prompt_template = """Summarize the document between the backticks below in no more than {} characters.
```
{}
```
Summary:
"""
        prompt = prompt_template.format(chunk_size, content[:1536])
        response_model = smr_client.invoke_endpoint(
            EndpointName=llm_endpoint,
            Body=json.dumps(
                {
                    "inputs": prompt,
                    "parameters": parameters,
                    "history": []
                }
            ),
            ContentType="application/json",
        )
        json_ret = json.loads(response_model['Body'].read().decode('utf8'))
        summary = json_ret['outputs']

    return summary

def convert_snippetJson2markdown(snippet_info, max_level=3):
    mk_head = ""
    p_head = ""
    # the heading stack is ordered innermost-first, so reverse it to emit
    # the outermost heading with the fewest '#' marks
    for item in snippet_info["heading"][0:max_level][::-1]:
        mk_head += "#"
        head = "{} {}".format(mk_head, item["heading"].replace('\n', ''))
        p_head += "{}\n".format(head)

    p_content = "{}\n{}".format(p_head, snippet_info['content'])
    return p_content
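# Illustrative input/output for convert_snippetJson2markdown; the snippet
# dict below is hypothetical:
#
#   snippet = {"heading": [{"font_size": 14, "heading": "Details", "fontsize_idx": 1},
#                          {"font_size": 18, "heading": "Chapter 1", "fontsize_idx": 0}],
#              "content": "Body text..."}
#   print(convert_snippetJson2markdown(snippet, max_level=3))
#   # prints:
#   # # Chapter 1
#   # ## Details
#   #
#   # Body text...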
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, default='./1.pdf', help='input file')
    parser.add_argument('--output_dir', type=str, default='./', help='output directory')
    parser.add_argument('--sep', type=str, default='=====', help='separator')
    parser.add_argument('--title_level', type=int, default=4, help='number of title levels to keep')
    parser.add_argument('--chunk_size', type=int, default=128, help='chunk_size')
    parser.add_argument('--llm_endpoint', type=str, default="", help='llm_endpoint')
    args = parser.parse_args()

    pdf_path = args.input_file
    kg_dir = args.output_dir
    kg_name = os.path.basename(pdf_path).replace('.pdf', '.txt')
    separator = args.sep
    max_title_level = args.title_level
    chunk_size = args.chunk_size
    llm_endpoint = args.llm_endpoint

    f_name = "{}/{}".format(kg_dir, kg_name)
    out_f = open(f_name, 'w')

    snippet_arr = []
    for snippet_info in split_pdf(pdf_path):
        snippet_arr.append(snippet_info)
        # p_content = convert_snippetJson2markdown(snippet_info, max_title_level)
        # out_f.write(summarize(p_content, chunk_size, llm_endpoint))
        # out_f.write("\n")
        # out_f.write(separator)
        # out_f.write("\n")

    all_info = json.dumps(snippet_arr, ensure_ascii=False)
    out_f.write(all_info)
    out_f.close()

    print("finish separation of {}".format(pdf_path))
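# Example invocation (assuming the script is saved as pdf_splitter.py; the
# output directory and endpoint name are illustrative):
#
#   python pdf_splitter.py --input_file ./1.pdf --output_dir ./out --llm_endpoint my-endpoint
#
# This writes ./out/1.txt containing a JSON array with one element per
# section, e.g. [{"heading": [...], "content": "..."}, ...]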