from sentence_transformers import models, SentenceTransformer
import json
import logging
from scipy.stats import gaussian_kde
import scipy.spatial
import numpy
import nltk

nltk.download('punkt')

# configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('log')
logger.setLevel(logging.INFO)


class MedicalBertModel():

    def __init__(self, run_null_model=None):
        self.run_null_model = run_null_model

    def run_null(self, null_location='null_hypothesis.txt', score=None):
        '''Run the null hypothesis to get a p-value for the given score.'''
        with open(null_location, 'r') as f_in:
            hist_dists_null = json.load(f_in)
        # next, create the kernel density estimate (KDE) for the null hypothesis
        kde = gaussian_kde(hist_dists_null)
        pvalue = self.get_pvalue(kde, score)
        return pvalue

    def get_pvalue(self, kde, score):
        '''Given the KDE of the null hypothesis, get the probability of
        achieving that score or better (a lower cosine distance).'''
        logger.debug(score)
        pvalue = kde.integrate_box_1d(0, score)
        return pvalue

    def break_up_by_sentence(self, the_string):
        '''Break up the supporting documentation into separate sentences.'''
        return nltk.tokenize.sent_tokenize(the_string)

    def get_best_n_sentences(self, corpus, distances, max_to_return=5):
        '''Get the top n closest sentences, where a lower distance is better.'''
        if len(corpus) < max_to_return:
            max_to_return = len(corpus)
        the_indices = numpy.argsort(distances)
        top_n_indices = the_indices[0:max_to_return]
        top_n_dist = [distances[i] for i in top_n_indices]
        logger.debug(corpus)
        top_n_corpus = [corpus[i] for i in top_n_indices]
        top_n_pvalues = []
        if self.run_null_model:
            for i in range(0, len(top_n_dist)):
                temp_result = self.run_null(score=top_n_dist[i])
                top_n_pvalues.append(temp_result)
        list_to_return = []
        for i in range(0, len(top_n_indices)):
            if self.run_null_model:
                the_dict = {'rank': i + 1,
                            'matching_sentence': top_n_corpus[i],
                            'pvalue': top_n_pvalues[i],
                            'distance': top_n_dist[i]}
            else:
                the_dict = {'rank': i + 1,
                            'matching_sentence': top_n_corpus[i],
                            'distance': top_n_dist[i]}
            list_to_return.append(the_dict)
        return list_to_return

    def run_model_and_null(self, request):
        '''Run the model, and the null model if specified.'''
        request_dict = json.loads(request)
        input_sentence = request_dict['input_sentence']
        the_paragraph = request_dict['input_paragraph']
        model_result = self.run_model(input_sentence, the_paragraph)
        return model_result

    def run_model(self, input_sentence, the_paragraph):
        # get the embedder
        word_embedding_model = models.Transformer('emilyalsentzer/Bio_ClinicalBERT')
        # apply mean pooling to get one fixed-size sentence vector
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        # create a corpus of every individual sentence within the paragraph
        corpus = self.break_up_by_sentence(the_paragraph)
        corpus_embeddings = embedder.encode(corpus)
        # define the input sentence as the query
        queries = [input_sentence]
        query_embedding = embedder.encode(queries)
        # calculate the cosine distance between the query embedding and each corpus embedding
        distances = scipy.spatial.distance.cdist(query_embedding, corpus_embeddings, 'cosine')[0]
        to_return = self.get_best_n_sentences(corpus, distances)
        return to_return
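

# --- Illustrative sketch (an assumption, not part of the original module) ---
# run_null() expects 'null_hypothesis.txt' to hold a JSON list of cosine
# distances observed under the null hypothesis. The helper below sketches one
# plausible way to produce that file, assuming the null distribution is built
# from distances between unrelated sentence pairs; the function name and the
# pairing scheme are illustrative, not the original pipeline.
def build_null_distribution(embedder, sentence_pairs, out_location='null_hypothesis.txt'):
    '''Write a JSON list of cosine distances for (assumed) unrelated pairs.'''
    null_distances = []
    for left, right in sentence_pairs:
        embeddings = embedder.encode([left, right])
        # cosine distance between the two sentence embeddings
        null_distances.append(float(scipy.spatial.distance.cosine(embeddings[0], embeddings[1])))
    with open(out_location, 'w') as f_out:
        json.dump(null_distances, f_out)
    return null_distances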


if __name__ == '__main__':
    mbm = MedicalBertModel(run_null_model=False)
    mock_input = json.dumps({'input_sentence': 'The patient is healthy.',
                             'input_paragraph': "The patient's health is good. "
                                                "The patient does not have a fever. "
                                                "The patient is a 28 year old female."})
    result = mbm.run_model_and_null(mock_input)
    logger.info(f'Model result: {result}')
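
    # Hypothetical follow-up (an assumption, not part of the original script):
    # if a null-distance file is present, e.g. one written by
    # build_null_distribution() above, the same request can be re-scored with
    # p-values by enabling the null model.
    import os
    if os.path.exists('null_hypothesis.txt'):
        mbm_with_null = MedicalBertModel(run_null_model=True)
        result_with_null = mbm_with_null.run_model_and_null(mock_input)
        logger.info(f'Model result with p-values: {result_with_null}')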