""" Based on Amazon SageMaker's 'bring your own scikit container' 'serve' example https://github.com/awslabs/amazon-sagemaker-examples/tree/master/advanced_functionality/scikit_bring_your_own This is the file that implements a flask server to do inferences. Modify this file for your own inference. """ import flask import os import numpy as np import io import tensorrt as trt TRT_LOGGER = trt.Logger(trt.Logger.INFO) import ctypes import os ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) import pycuda.driver as cuda import pycuda.autoinit import time prefix = '/opt/ml/' model_path = os.path.join(prefix, 'model') import helpers.data_processing as dp import helpers.tokenization as tokenization vocab_file_path = os.path.join(model_path, "vocab.txt") engine_path = os.path.join(model_path, "bert_large_384.engine") tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True) # The maximum number of tokens for the question. Questions longer than this will be truncated to this length. max_query_length = 64 # When splitting up a long document into chunks, how much stride to take between chunks. doc_stride = 128 # The maximum total input sequence length after WordPiece tokenization. # Sequences longer than this will be truncated, and sequences shorter max_seq_length = 384 # A singleton for holding the model. This simply loads the model and holds it. # It has a predict function that does a prediction based on the model and the input data. class ScoringService(object): model = None # Where we keep the model when it's loaded @classmethod def get_model(cls): """Get the model object for this instance, loading it if it's not already loaded.""" try: if (cls.model == None): cls.model = open(engine_path, "rb") except Exception as e: # Don't return any extra information cls.model = None return cls.model @classmethod def predict(cls, short_paragraph_text, question_text): """For the input, do the predictions and return them. Args: input (a pandas dataframe): The data on which to do the predictions. There will be one prediction per row in the dataframe""" # Extract tokens from the paragraph doc_tokens = dp.convert_doc_tokens(short_paragraph_text) # Extract features from the paragraph and question features = dp.convert_examples_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length) # Load the BERT-Large Engine with open(engine_path, "rb") as f, \ trt.Runtime(TRT_LOGGER) as runtime, \ runtime.deserialize_cuda_engine(f.read()) as engine, \ engine.create_execution_context() as context: # We use batch size 1. input_shape = (max_seq_length, 1) input_nbytes = trt.volume(input_shape) * trt.int32.itemsize # Allocate device memory for inputs. d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case) # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape. for binding in range(3): context.set_binding_shape(binding, input_shape) assert context.all_binding_shapes_specified # Allocate output buffer by querying the size from the context. This may be different for different input shapes. 
            h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
            d_output = cuda.mem_alloc(h_output.nbytes)

            print("\nRunning Inference now...")
            eval_start_time = time.time()

            # Copy inputs to the device
            cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)

            # Run inference
            context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
                                     stream_handle=stream.handle)

            # Transfer predictions back from the GPU
            cuda.memcpy_dtoh_async(h_output, d_output, stream)

            # Synchronize the stream
            stream.synchronize()
            eval_time_elapsed = time.time() - eval_start_time

            h_output = h_output.transpose((1, 0, 2, 3, 4))

            print("-----------------------------")
            print("Running Inference at {:.3f} Sentences/Sec".format(1.0 / eval_time_elapsed))
            print("-----------------------------")

            return h_output, doc_tokens, features, 1.0 / eval_time_elapsed


# The flask app for serving predictions
app = flask.Flask(__name__)


@app.route('/ping', methods=['GET'])
def ping():
    """Determine if the container is working and healthy.

    In this sample container, we declare it healthy if we can load the model successfully.
    """
    health = ScoringService.get_model() is not None  # You can insert a health check here
    status = 200 if health else 404
    return flask.Response(response='\n', status=status, mimetype='application/json')


@app.route('/invocations', methods=['POST'])
def transformation():
    """Do an inference on a single request.

    The request body is JSON with two fields, "short_paragraph_text" and
    "question_text". The response is plain text containing the predicted
    answer, its probability, and the inference throughput.
    """
    # Only JSON input is supported
    if flask.request.content_type != 'application/json':
        return flask.Response(response='This predictor only supports json data. '
                              'We have a request of type ' + flask.request.content_type,
                              status=415, mimetype='text/plain')

    print("Getting request.")
    json_data = flask.request.get_json(force=True)
    short_paragraph_text = json_data["short_paragraph_text"]
    question_text = json_data["question_text"]

    print("Got request, starting prediction.")

    # Do prediction
    h_output, doc_tokens, features, sentences_sec = ScoringService.predict(short_paragraph_text, question_text)

    print("Finished prediction.")

    result = ""
    for batch in h_output:
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]

        # The total number of n-best predictions to generate
        n_best_size = 20

        # The maximum length of an answer that can be generated. This is needed
        # because the start and end predictions are not conditioned on one another.
        max_answer_length = 30

        prediction, nbest_json, scores_diff_json = \
            dp.get_predictions(doc_tokens, features, start_logits, end_logits,
                               n_best_size, max_answer_length)

        result += "Answer: '{}' with prob: {:.3f}% at {:.3f} Sentences/Sec.".format(
            prediction, nbest_json[0]['probability'] * 100.0, sentences_sec)

    return flask.Response(response=result, status=200, mimetype='text/plain')
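
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not executed by the server). This assumes the
# container is started through the usual SageMaker 'serve' entrypoint, with a
# web server listening on port 8080; the hostname and port below are
# assumptions, not defined in this file.
#
#   curl -X POST http://localhost:8080/invocations \
#       -H "Content-Type: application/json" \
#       -d '{"short_paragraph_text": "TensorRT is a platform for high-performance deep learning inference.",
#            "question_text": "What is TensorRT?"}'
#
# The response is plain text of the form produced by transformation() above:
#   Answer: '<answer>' with prob: <p>% at <n> Sentences/Sec.
# ---------------------------------------------------------------------------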