import seaborn as sns 
import matplotlib.pyplot  as plt
import pandas as pd
import boto3, botocore
from tqdm import tqdm
import boto3

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

## retrieve the top medical conditions based on confidence score
## nFeature: number of features to retrieve
## threshold: confidence score from comprehend
def retrieve_mcList(df, nFeature=20,threshold=0.9):
    ## change all terms to lower case
    df['MEDICAL_CONDITION']=df['MEDICAL_CONDITION'].str.lower()
    ## 
    df=df.replace(['hemostatic','hematoma','Hemostasis'],'hemostasis')
    df=df.replace(['wounds','masses','lesions','polyps'],['wound' ,'mass','lesion','polyp'])
    mcList=df[df.Score>=threshold].MEDICAL_CONDITION.value_counts().index[:nFeature].to_list()
    
    
    return mcList, df


def mc_barplot(df, threshold_score=0.9,topN=20):
    df_mcs_vc = df.MEDICAL_CONDITION[df.Score>0.9].value_counts()

    df_mcs_top = df_mcs_vc[:topN,]
    plt.figure(figsize=(20,5))
    chart=sns.barplot(df_mcs_top.index, df_mcs_top.values, alpha=0.8)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
    plt.title(f'top {topN} medical conditions in the patients')
    plt.ylabel('Number of Occurrences', fontsize=12)
    plt.xlabel('occurance', fontsize=14)
    plt.show()
    return df_mcs_vc


#########*************************##########

## Function to extract a single record
def extractMC_v2(json_file):
    
    ## initialize the list variables for medical_conditions, scores and traits
    medical_conditions=[]
    scores=[]
    traits=[]
    for row in json_file['Entities']:
        # 
        if row['Category'] == "MEDICAL_CONDITION":
            medical_conditions.append(row['Text'])#  += row['Text'] + ' '
            scores.append(row['Score'])
            trait='NaN'
            if row['Traits']:
                #print(row['Traits'],row['Text'] )
                trait=row['Traits'][0]['Name']
                
            traits.append(trait)
        df_mc = pd.DataFrame({'MEDICAL_CONDITION': pd.Series(medical_conditions), 'Score':pd.Series(scores),'Trait':pd.Series(traits)})
    return df_mc

## extract medical conditions in a batch

def extractMCbatch(transcriptionList,patientIDList):
    df_final = pd.DataFrame()
    #patient_id=100
    #assert(len(transcriptionList)==len(patientIDList)):
    if(len(transcriptionList)!=len(patientIDList)):
        return("Error! different length!")

    
    for item,patient_id in zip(transcriptionList,patientIDList):
        
        df_ind = extractMC_v2(item)
        df_ind['ID']=patient_id
        df_final=df_final.append(df_ind)
      
    
    # remove the duplicated entries 
    df_final=df_final.sort_values(by=['ID','MEDICAL_CONDITION']).drop_duplicates(['ID','MEDICAL_CONDITION'],keep='last')

    #print(df_final.shape)
    
    return df_final


## this function will 
## this function will extract the subpopulation given dataframe, medical_speciality 
def subpopulation_comprehend(df, medical_specialty,sampleSize=200):
    ## select the sub population
    df_sub=df[df.medical_specialty==medical_specialty ].reset_index()
    #df_sub.head()

    ## sample from the population
    df_sub_sub=df_sub.sample(n=sampleSize, random_state=123)
    print("original data shape is ",df_sub_sub.shape)

    ## remove missing entries
    df_sub_sub=df_sub_sub[df_sub_sub.transcription.notna()==True]
    print("data shape after removing missing entries is ",df_sub_sub.shape)   
    
    cm  = boto3.client(service_name='comprehendmedical', use_ssl=True, region_name = 'us-east-1')
    #idx=0
    #print("df_sub_sub['transcription'] ", len(df_sub_sub['transcription']))
    patient_ids=df_sub_sub['id'].to_list()
    ## comprehend processing
    # transcrption_list=df_sub_sub['id'].to_list()
    transcrption_list=[]
    for text in tqdm(df_sub_sub['transcription']):
        #print(idx)
        #print("----------------")
        #print("analyzing:", text)
        comprehend_result = cm.detect_entities_v2(Text = text)
        #print(len(comprehend_result))
        transcrption_list.append(comprehend_result)
        
        
    return transcrption_list, patient_ids


def corrPlot(df):
    plt.figure(figsize=(15,15))
    corr = df.iloc[:,1:].corr() ## skip the 1st column as it is the patient_id
    ax = sns.heatmap(
        corr, 
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True
        )
    ax.set_xticklabels(ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
        );
    
    return

##### function to interate the medical conditions and then convert to a wide formate
def dataframe_convert(df_raw,df_final, condition ):
    
    #step1: get the sub dataframe 
    df_sub = df_raw[df_raw.MEDICAL_CONDITION==condition]
    
    #step2: iterate the sub dataframe and fill the information into the native ones
    for index, row  in df_sub.iterrows():
        #print(row)
        sid=row.ID
       
        #print("condition is:",condition)
        df_final.loc[df_final.ID == sid,condition]=row.Score
        #print("Processed: ",df_final.columns)
    return df_final


##### the function to convert dataframe of medical conditions from long format to wide format


colname_mc=['nontender', 'foreign body', 'edema', 'alert', 'murmur',
       'chest pain', 'vomiting', 'hiatal hernia', 'distress', 'hemostasis',
       'carpal tunnel syndrome', 'endometriosis', 'weakness', 'pain', 'mass',
       'inflammation', 'polyp', 'bleeding', 'hypertension', 'supple', 'fever',
       'stenosis', 'wound', 'cyanosis', 'infection', 'erythema',
       'normocephalic', 'fracture', 'lesion', 'ulceration', 'nausea', 'cough',
       'tumor', 'soft', 'shortness of breath', 'injury', 'diabetes']


def df_mc_generator(df_mcs,colname_mc=colname_mc ,colname_other=['ID',"Label"] ):
    
    ## remove duplicate rows
    df_1 = df_mcs.drop_duplicates(subset=['ID']).copy()
   
    ## generate an empty dataframe first
    column_names=colname_other+colname_mc
    ## column names
    df_combined=pd.DataFrame(columns=column_names)
    ## copy ID and positive data from the original df
    df_combined[colname_other]=df_1[colname_other]
   
    ## loop to fill in the information for each condition
    for con in colname_mc:
        #print(df_combined.columns)
        df_combined = dataframe_convert(df_mcs,df_combined, con )
    df_combined = df_combined.fillna(0)
    df_combined["Label"] = df_combined["Label"].astype(int)
    
    return df_combined

def df_mc_generator_slim(df_mcs,colname_mc=colname_mc ,colname_other=['ID'] ): 
    
    ## remove duplicate rows
    if 'ID' not in df_mcs:
        df_mcs['ID']=0
    
    df_1 = df_mcs.drop_duplicates(subset=['ID']).copy()
    #print(colname_mc)
    ## generate an empty dataframe first
    column_names=colname_other+colname_mc
    ## column names
    df_combined=pd.DataFrame(columns=column_names)
    ## copy ID and positive data from the original df
    df_combined[colname_other]=df_1[colname_other]
   
    ## loop to fill in the information for each condition
    for con in colname_mc:
        #print(df_combined.columns)
        df_combined = dataframe_convert(df_mcs,df_combined, con )
    df_combined = df_combined.fillna(0)
    #df_combined["Label"] = df_combined["Label"].astype(int)
    
    return df_combined