import argparse
import csv
import os
import re
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--i2b2_dir', required=True)
    parser.add_argument('--save_dir', required=True)
    parser.add_argument('--header_dir', required=False)
    # store_true makes this a proper boolean flag; a bare default=False would
    # turn any value passed on the command line into a truthy string
    parser.add_argument('--admission_only', action='store_true', default=False)
    parser.add_argument('--seed', default=123, type=int)
    return parser.parse_args()


def filter_admission_text(i2b2_dir, header_dir, save_dir) -> pd.DataFrame:
    """
    Filters the text information by section and only keeps sections that are known at admission time.
    """
    # For both Set1 and Set2, extract section headers and their corresponding span information for the i2b2 data
    headings_set1 = header_info_extraction(header_dir, 'Set1')
    headings_set2 = header_info_extraction(header_dir, 'Set2')

    # Retrieve unique headers and the corresponding section text's span information
    sec_text_span = unique_headings(headings_set1, headings_set2, save_dir)

    # Section headers that belong to the admission group
    admission_filter = [
        "chief complaint", "physical examination", "impression",
        "history of present illness", "past medical history", "allergies",
        "review of systems", "family history", "social history",
        "reason for visit", "past surgical history", "current medications",
        "social history and family history", "medications on admission",
        "major problems", "history",
        "past medical history and social history", "narrative history",
        "reason for consult", "changes to allergies", "attending note",
        "therapy rendered course in ed",
        "chief complaint and history of present illness",
        "transfer medications", "interim history", "reason for admission",
        "other problems", "labs on admission", "present illness",
        "medications", "problems", "interval history", "habits",
        "medications at home"]

    # At least one of these sections must be present for a note to be kept
    mandatory_sections = [
        "CHIEF COMPLAINT: ", "IMPRESSION: ", "HISTORY OF PRESENT ILLNESS: ",
        "PAST MEDICAL HISTORY: ", "REASON FOR VISIT: ", "MAJOR PROBLEMS: ",
        "HISTORY: ", "PAST MEDICAL HISTORY AND SOCIAL HISTORY: ",
        "NARRATIVE HISTORY: ", "REASON FOR CONSULT: ", "ATTENDING NOTE: ",
        "CHIEF COMPLAINT AND HISTORY OF PRESENT ILLNESS: ",
        "REASON FOR ADMISSION: ", "PRESENT ILLNESS: "]

    filenames = []
    instances = []
    admission_text = ''
    first_rec = True

    # Combine the required sections of a clinical note under column "ADMISSION_TEXT"
    for index, row in sec_text_span.iterrows():
        if row['new_heading'] in admission_filter:
            if first_rec:
                filename = row['file']
                first_rec = False

            # Locate the corresponding XML file of the i2b2 data
            XML_FOLDER = 'training-RiskFactors-Complete-' + row['set_name']
            XML_FILE_PATH = os.path.join(i2b2_dir, XML_FOLDER)
            f_name = row['file'][:-3] + "xml"
            file_full_path = os.path.join(XML_FILE_PATH, f_name)

            # Parse the XML file to extract the text information
            tree = ET.parse(file_full_path)
            root = tree.getroot()
            note = root[0].text

            # Retrieve the beginning and end positions of the section
            section_start = row['sec_start']
            if row['sec_end'] < 0:
                # Last section of the file: read until the end of the note
                section_end = len(note)
            else:
                section_end = row['sec_end']

            filenames.append(row['file'])
            text = note[section_start:section_end].strip()

            # Replace runs of three or more dashes, stars or underscores with a space
            replace_stars_underscores = re.compile(r"-{3,}|\*{3,}|_{3,}")
            text = replace_stars_underscores.sub(" ", text).strip()
            combine_whitespace = re.compile(r"\s+")
            text = combine_whitespace.sub(" ", text).strip()

            # Drop the section if its text is blank
            if text != "":
                section_text = row['new_heading'].upper() + ': ' + text.replace('\n', ' ').strip()
            else:
                continue

            # Combine the different sections of a clinical note in one string
            if row['file'] == filename:
                admission_text = admission_text + '\n\n' + section_text.strip()
            else:
                # Drop the clinical note if none of the mandatory sections are included,
                # otherwise add it to the output dataframe
                if any(section in admission_text for section in mandatory_sections):
                    instance = filename[:-4], filename[:3], admission_text.strip()
                    instances.append(instance)
                admission_text = section_text
                filename = row['file']

    # Each instance represents a record of the output file consisting of ROW_ID, SUBJECT_ID and ADMISSION_TEXT.
    # The final note of the loop gets the same mandatory-section check as the notes before it.
    if not first_rec and any(section in admission_text for section in mandatory_sections):
        instance = filename[:-4], filename[:3], admission_text.strip()
        instances.append(instance)

    adm_text_df = pd.DataFrame(
        instances, columns=['ROW_ID', 'SUBJECT_ID', 'ADMISSION_TEXT'])
    return adm_text_df


def unique_headings(headings_set1: pd.DataFrame, headings_set2: pd.DataFrame,
                    i2b2_headings_dict_path: str) -> pd.DataFrame:
    """
    Creates unique headings across all files and adds columns with the beginning
    and ending positions of each section's text.
    """
    # Combine the section header information of both sets
    frames = [headings_set1, headings_set2]
    all_headings = pd.concat(frames)

    # Remove punctuation characters from the section headers
    # (the hyphen is escaped so that '#-=' is not interpreted as a character range)
    all_headings['heading'] = [
        re.sub(r'[/\\:*?"<>|#\-=_]', ' ', elem).strip()
        for elem in all_headings.heading]

    # Import the dictionary which maps the existing section headers of both sets to new unique headers
    headings_dict_df = pd.read_csv(i2b2_headings_dict_path)
    mapping_dict = headings_dict_df.set_index('old_heading')['new_heading'].to_dict()

    # Use the imported dictionary to create the column 'new_heading' with unique headings;
    # headers without a mapping keep their original heading
    all_headings['new_heading'] = all_headings['heading'].map(mapping_dict)
    all_headings['new_heading'] = np.where(all_headings['new_heading'].isnull(),
                                           all_headings['heading'],
                                           all_headings['new_heading'])

    # Convert the data type of the span columns from string to integer
    all_headings['span_end'] = all_headings['span_end'].astype(np.int64)
    all_headings['span_start'] = all_headings['span_start'].astype(np.int64)

    # A section's text starts one character after its heading ends
    all_headings['sec_start'] = all_headings['span_end'] + 1
    sorted_span = all_headings.groupby(["file"], sort=False).apply(
        lambda x: x.sort_values(["span_start"])).reset_index(drop=True)
    shifted = sorted_span.groupby("file").shift(-1).drop(
        ['heading', 'span_end', 'new_heading', 'sec_start'], axis=1)

    # Use the lagged span_start of the next heading to get each section's last
    # character position; a negative sec_end marks the last section of a file,
    # which runs until the end of the note
    sec_text_span = sorted_span.join(shifted.rename(columns=lambda x: x + "_lag"))
    sec_text_span['span_start_lag'] = sec_text_span['span_start_lag'].fillna(-1)
    sec_text_span['span_start_lag'] = sec_text_span['span_start_lag'].astype(np.int64)
    sec_text_span['sec_end'] = sec_text_span['span_start_lag'] - 1
    return sec_text_span


def header_info_extraction(header_dir, set_name) -> pd.DataFrame:
    """
    Extracts the heading and span information from the section header data available for the i2b2 data.
    """
    file_path = os.path.join(header_dir, set_name)

    # Names of the clinical record files in alphabetical order
    all_files = os.listdir(file_path)
    all_files.sort()

    instances = []
    for filename in all_files:
        # Read the file
        path = os.path.join(file_path, filename)
        with open(path, 'r', encoding='utf-8') as f:
            file = f.read()

        # Convert the text to token sequences, one per line
        token_sequences = [sentence.split() for sentence in file.split('\n')]

        # Token position of the first word of a section heading
        word_pos = 4

        # Extract the heading, span information, filename and the corresponding set name into a DataFrame
        for seq in token_sequences:
            # Skip blank or malformed lines that contain no heading token
            if len(seq) <= word_pos:
                continue
            heading = seq[word_pos]
            span_start = seq[2]
            span_end = seq[3]
            # Append the remaining tokens of a multi-word heading
            while len(seq) - 1 != word_pos:
                word_pos = word_pos + 1
                heading = heading + ' ' + seq[word_pos]
            instance = heading.lower(), span_start, span_end, filename, set_name
            instances.append(instance)
            word_pos = 4

    headings_df = pd.DataFrame(
        instances, columns=['heading', 'span_start', 'span_end', 'file', 'set_name'])
    return headings_df


def save_i2b2_split_patient_wise(df, label_column, save_dir, task_name, seed, column_list=None):
    """
    Splits an i2b2 dataframe into 70/10/20 train, val and test sets with no patient
    occurring in more than one set. Uses ROW_ID as the ID column and saves the splits to save_dir.
    """
    if column_list is None:
        column_list = ["ID", "ADMISSION_TEXT", label_column]

    np.random.seed(seed)

    # Split per patient, so that no patient reoccurs in one of the eval sets
    unique_patients = df.SUBJECT_ID.unique()
    np.random.shuffle(unique_patients)
    data_split = np.split(unique_patients,
                          [int(.7 * len(unique_patients)), int(.8 * len(unique_patients))])

    # Use the row id as the general id
    df = df.rename(columns={'ROW_ID': 'ID'})

    # Create the path to the task data
    os.makedirs(save_dir, exist_ok=True)

    # Save the shuffled splits to the data folder
    for i, split_name in enumerate(["train", "val", "test"]):
        split_set = df[df.SUBJECT_ID.isin(data_split[i])].sample(
            frac=1, random_state=seed)[column_list]

        # Lower-case the column names
        split_set.columns = map(str.lower, split_set.columns)

        split_set.to_csv(os.path.join(save_dir, "{}_{}.csv".format(task_name, split_name)),
                         index=False, quoting=csv.QUOTE_ALL)
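
# A minimal driver is sketched below, assuming the script is run directly.
# This is an assumption: the section defines parse_args() but never shows how
# the functions are wired together, and the label column and task name passed
# to save_i2b2_split_patient_wise are hypothetical placeholders.
if __name__ == '__main__':
    args = parse_args()

    # Build the admission-text dataframe from the i2b2 section headers and XML notes
    adm_df = filter_admission_text(args.i2b2_dir, args.header_dir, args.save_dir)

    # Hypothetical usage: once a task-specific label column (here 'LABEL') has
    # been merged onto adm_df, the patient-wise 70/10/20 split could be written out:
    # save_i2b2_split_patient_wise(adm_df, label_column='LABEL',
    #                              save_dir=args.save_dir, task_name='i2b2',
    #                              seed=args.seed)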