######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0                                     #
######################################################################
import os
import torch
import subprocess
import torch.neuron
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import time
import random
from tqdm import tqdm
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from essential_generators import DocumentGenerator
from common_settings import default_max_length, default_batch_size, default_model_name

# Benchmark configuration, overridable via environment variables
max_length = int(os.getenv('MAX_LENGTH', default_max_length))
batch_size = int(os.getenv('BATCH_SIZE', default_batch_size))
model_name = os.getenv('MODEL_NAME', default_model_name)

# Set up NeuronCore groups. Each Inferentia chip exposes 4 NeuronCores,
# so e.g. an inf1.6xlarge with 4 chips yields 16 cores.
num_neuron_chips = int(subprocess.getoutput('ls /dev/neuron* | wc -l'))
num_cores = 4 * num_neuron_chips
nc_env = ','.join(['1'] * num_cores)
print('Neuron Core Group Sizes: %s' % nc_env)
os.environ['NEURONCORE_GROUP_SIZES'] = nc_env
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

# Neuron model file name
neuron_model_file = '%s_inf_%d_%d.pt' % (model_name, max_length, batch_size)

# Benchmark test parameters - number of models, threads, and total requests
num_models = num_cores   # num_models <= number of cores (4 for inf1.xl and inf1.2xl, 16 for inf1.6xl)
num_threads = num_cores  # Setting num_threads to num_models works well.
num_requests = 10000
num_request_samples = 10

# Get the tokenizer and pre-encode a pool of randomly generated input sentences
tokenizer = AutoTokenizer.from_pretrained(model_name)
gen = DocumentGenerator()

sequence_list = []
encoded_input_list = []
for _ in range(num_request_samples):
    sequence = gen.sentence()
    encoded_inputs = tokenizer.encode_plus(sequence,
                                           max_length=max_length,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt')
    sequence_list.append(sequence)
    encoded_input_list.append(encoded_inputs)

def load_model(file_name):
    # Load a traced (TorchScript) Neuron model
    model = torch.jit.load(file_name)
    return model

latency_list = []

def task(model, encoded_inputs):
    # Run one batched inference request and record its latency
    begin = time.time()
    input_ids_tensor = encoded_inputs['input_ids']
    batch_input_ids_tensor = torch.cat([input_ids_tensor] * batch_size)
    attention_mask_tensor = encoded_inputs['attention_mask']
    batch_attention_mask_tensor = torch.cat([attention_mask_tensor] * batch_size)
    ts_input = batch_input_ids_tensor, batch_attention_mask_tensor
    _ = model(*ts_input)
    latency_list.append(time.time() - begin)

def benchmark(num_models, num_threads, num_requests, model_file):
    # Load one model replica per NeuronCore
    print('Loading Models To Memory')
    models = [load_model(model_file) for _ in range(num_models)]

    print('Starting benchmark')
    begin = time.time()
    futures = []
    # Submit all tasks, then wait for them to finish
    # https://stackoverflow.com/questions/51601756/use-tqdm-with-concurrent-futures
    with tqdm(total=num_requests) as pbar:
        with ThreadPoolExecutor(num_threads) as pool:
            for i in range(num_requests):
                futures.append(pool.submit(task,
                                           models[i % len(models)],
                                           random.choice(encoded_input_list)))
            print('Loaded Requests')
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
    return time.time() - begin

test_time = benchmark(num_models, num_threads, num_requests, neuron_model_file)

print('Latency over %d samples (P50, P90, P95):' % len(latency_list))
print(np.percentile(np.array(latency_list), [50, 90, 95]))
print('Total time taken for %d * (%d sentences) is %0.4f seconds'
      % (num_requests, batch_size, test_time))
print('Throughput (num_requests * batch_size / sec) = %0.4f'
      % (num_requests * batch_size / test_time))
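
# ----------------------------------------------------------------------
# For reference, a minimal sketch (hypothetical; the actual compile step
# lives in a companion script, not here) of how a Neuron model file
# matching the '%s_inf_%d_%d.pt' naming convention above could be
# produced with torch.neuron.trace. It assumes the same tokenizer,
# max_length, and batch_size used by this benchmark, and a model loaded
# with return_dict=False so that it returns a traceable tuple:
#
#   model = AutoModelForSequenceClassification.from_pretrained(
#       model_name, return_dict=False)
#   model.eval()
#   # Trace with an example input of the exact shape served at runtime
#   example = tokenizer.encode_plus('This is a sample sentence.',
#                                   max_length=max_length,
#                                   padding='max_length',
#                                   truncation=True,
#                                   return_tensors='pt')
#   example_inputs = (torch.cat([example['input_ids']] * batch_size),
#                     torch.cat([example['attention_mask']] * batch_size))
#   traced_model = torch.neuron.trace(model, example_inputs)
#   traced_model.save(neuron_model_file)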