######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0                                     #
######################################################################

import os
import random
import time
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import torch
from essential_generators import DocumentGenerator
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from common_settings import default_max_length, default_batch_size, default_model_name

max_length = int(os.getenv('MAX_LENGTH', default_max_length))
batch_size = int(os.getenv('BATCH_SIZE', default_batch_size))
model_name = os.getenv('MODEL_NAME', default_model_name)
ts_model_file = '%s_gpu_%d_%d.pt' % (model_name, max_length, batch_size)

# Benchmark test parameters - number of model copies, threads, and requests
num_models = 1            # Number of model copies to keep resident in GPU memory
num_threads = 2           # Setting num_threads close to num_models works well
num_requests = 10000
num_request_samples = 10  # Distinct input sentences sampled during the run
half_precision = True

print('Max Length: %d, Batch Size: %d, Model Name: %s, Half Precision: %r'
      % (max_length, batch_size, model_name, half_precision))

# Get tokenizer and create a pool of encoded sample inputs
tokenizer = AutoTokenizer.from_pretrained(model_name)
gen = DocumentGenerator()
encoded_input_list = []
for _ in range(num_request_samples):
    sequence = gen.sentence()
    encoded_inputs = tokenizer.encode_plus(sequence, max_length=max_length,
                                           padding='max_length', truncation=True,
                                           return_tensors='pt')
    encoded_input_list.append(encoded_inputs)


def load_model(file_name, torchscript):
    # Load either the pre-traced TorchScript model or the original
    # Hugging Face model, then move it to the GPU for inference.
    if torchscript:
        model = torch.jit.load(file_name)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=False)
    model.eval()
    return model.cuda()


latency_list = []


def task(model, encoded_inputs):
    # Run one batched forward pass and record its wall-clock latency.
    begin = time.time()
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=half_precision):
        # Tile the single encoded sample into a full batch.
        batch_input_ids = torch.cat([encoded_inputs['input_ids']] * batch_size)
        batch_attention_mask = torch.cat([encoded_inputs['attention_mask']] * batch_size)
        _ = model(batch_input_ids.cuda(), batch_attention_mask.cuda())
    # CUDA kernels launch asynchronously; synchronize so the measured
    # latency covers the full forward pass.
    torch.cuda.synchronize()
    latency_list.append(time.time() - begin)


def benchmark(num_models, num_threads, num_requests, model_file, torchscript=True):
    # Load num_models copies of the model into GPU memory
    print('Loading Models To Memory')
    models = [load_model(model_file, torchscript) for _ in range(num_models)]

    print('Starting benchmark')
    begin = time.time()
    futures = []
    # Submit all tasks, then advance the progress bar as they complete
    # https://stackoverflow.com/questions/51601756/use-tqdm-with-concurrent-futures
    with tqdm(total=num_requests) as pbar:
        with ThreadPoolExecutor(num_threads) as pool:
            for i in range(num_requests):
                # Round-robin requests across the loaded model copies
                futures.append(pool.submit(task, models[i % len(models)],
                                           random.choice(encoded_input_list)))
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
    return time.time() - begin


test_time = benchmark(num_models, num_threads, num_requests, ts_model_file, torchscript=True)

print('Latency percentiles in seconds over %d samples (P50, P90, P95):' % len(latency_list))
print(np.percentile(np.array(latency_list), [50, 90, 95]))
print('Total time taken for %d requests * (%d sentences) is %0.4f seconds'
      % (num_requests, batch_size, test_time))
print('Throughput (num_requests * batch_size / sec) = %0.4f'
      % (num_requests * batch_size / test_time))
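

# ----------------------------------------------------------------------
# The benchmark above assumes the TorchScript artifact named by
# ts_model_file was produced ahead of time (typically by a companion
# trace script). The helper below is a minimal, hypothetical sketch of
# how such a file could be generated with torch.jit.trace; it is not
# called anywhere, and the sample's actual trace recipe may differ.
# ----------------------------------------------------------------------
def trace_model_for_benchmark(output_file=ts_model_file):
    # Reuse one encoded sample, tiled to the benchmark's batch shape,
    # as the example input for tracing.
    sample = encoded_input_list[0]
    example_inputs = (torch.cat([sample['input_ids']] * batch_size).cuda(),
                      torch.cat([sample['attention_mask']] * batch_size).cuda())
    # torchscript=True configures the Hugging Face model for tracing
    # (tuple outputs instead of a ModelOutput dict).
    model = AutoModelForSequenceClassification.from_pretrained(model_name, torchscript=True)
    model.eval()
    model = model.cuda()
    with torch.no_grad():
        traced = torch.jit.trace(model, example_inputs)
    torch.jit.save(traced, output_file)

# Uncomment to (re)generate the TorchScript file before benchmarking:
# trace_model_for_benchmark()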