# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
import os
import json
import argparse

from tqdm import tqdm
import datasets
from datasets import load_dataset, load_metric, Dataset, Features
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    default_data_collator,
)

data_collator = default_data_collator


def prepare_train_features(examples, tokenizer, max_length, doc_stride):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    pad_on_right = tokenizer.padding_side == "right"
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans; this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
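                # Illustrative walk-through (made-up offsets, not taken from the real data): if start_char is 42
                # and the context token offsets are ..., (38, 41), (42, 47), (48, 53), ..., the first loop below
                # advances token_start_index past (38, 41) and (42, 47), then we step back by one, so
                # start_positions points at the token whose span begins at character 42. The second loop does the
                # symmetric walk from the right to set end_positions.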
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


def create_squad_dict(actual_squad):
    # Flatten a SQuAD-format JSON dict into parallel column lists.
    titles = []
    contexts = []
    ids = []
    questions = []
    answers = []
    for example in tqdm(actual_squad["data"]):
        title = example.get("title", "").strip()
        for paragraph in example["paragraphs"]:
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                question = qa["question"].strip()
                id_ = qa["id"]
                answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                answer_list = [answer["text"].strip() for answer in qa["answers"]]
                titles.append(title)
                contexts.append(context)
                questions.append(question)
                ids.append(id_)
                answers.append({
                    "answer_start": answer_starts,
                    "text": answer_list,
                })
    dataset_dict = {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
    }
    return dataset_dict


if __name__ == "__main__":
    # SageMaker configuration
    print('Starting training...')
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--max_length", type=int, default=384)
    parser.add_argument("--doc_stride", type=int, default=128)

    # Data, model, and output directories
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=None)  # os.environ["SM_CHANNEL_TEST"]

    args, _ = parser.parse_known_args()

    print('model directory:', args.model_dir)
    print('train directory:', args.training_dir)
    print('output data directory:', args.output_data_dir)
    os.system('echo training directory contents:')
    os.system(f'ls {args.training_dir}')

    hf_args = TrainingArguments(
        args.model_dir,
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        num_train_epochs=args.epochs,
        weight_decay=args.weight_decay,
    )

    with open(args.training_dir + '/v2.0/dev-v2.0.json', 'r') as f:
        squad_dev = json.load(f)
    with open(args.training_dir + '/augmented_squad.json', 'r') as f:
        actual_squad = json.load(f)

    # datasets = load_dataset("squad_v2")  ## NEED TO COMBINE WITH OUR LABELS
    dataset_dict = create_squad_dict(actual_squad)
    test_dataset_dict = create_squad_dict(squad_dev)
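    # Illustrative record shape (made-up values, not taken from the actual files): each column list built
    # above holds one SQuAD-style entry per question, e.g.
    #   context:  "AWS was launched in 2006."
    #   question: "When was AWS launched?"
    #   answers:  {"text": ["2006"], "answer_start": [20]}
    # The explicit Features schemas below describe exactly this layout.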
    squad_dataset = Dataset.from_dict(dataset_dict, features=datasets.Features(
        {
            "id": datasets.Value("string"),
            "title": datasets.Value("string"),
            "context": datasets.Value("string"),
            "question": datasets.Value("string"),
            "answers": datasets.features.Sequence(
                {
                    "text": datasets.Value("string"),
                    "answer_start": datasets.Value("int32"),
                }
            ),
            # These are the features of the dataset.
        }
    ))
    squad_test = Dataset.from_dict(test_dataset_dict, features=datasets.Features(
        {
            "id": datasets.Value("string"),
            "title": datasets.Value("string"),
            "context": datasets.Value("string"),
            "question": datasets.Value("string"),
            "answers": datasets.features.Sequence(
                {
                    "text": datasets.Value("string"),
                    "answer_start": datasets.Value("int32"),
                }
            ),
        }
    ))

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(args.model_name)

    # features = prepare_train_features(datasets['train'][:5], tokenizer)
    tokenized_train = squad_dataset.map(
        prepare_train_features,
        batched=True,
        remove_columns=squad_dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": args.max_length, "doc_stride": args.doc_stride},
    )
    tokenized_test = squad_test.map(
        prepare_train_features,
        batched=True,
        remove_columns=squad_test.column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": args.max_length, "doc_stride": args.doc_stride},
    )

    trainer = Trainer(
        model,
        hf_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()

    if args.test_dir:
        # Evaluate the model on the tokenized dev set.
        eval_result = trainer.evaluate(eval_dataset=tokenized_test)

        # Write eval results to a file that can be accessed later in the S3 output.
        with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
            print("***** Eval results *****")
            for key, value in sorted(eval_result.items()):
                writer.write(f"{key} = {value}\n")

    # Save the model to S3.
    trainer.save_model(args.model_dir)
    os.system(f'ls {args.model_dir}')
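# The block below is not part of the training script itself; it is a commented, hypothetical sketch of how this
# entry point could be launched from a notebook with the SageMaker HuggingFace estimator. The role, bucket,
# entry-point name, and framework versions are placeholders and depend on your account and training image.
#
#   from sagemaker.huggingface import HuggingFace
#
#   estimator = HuggingFace(
#       entry_point="train.py",              # assumed name of this file
#       instance_type="ml.p3.2xlarge",       # placeholder instance type
#       instance_count=1,
#       role="<your-sagemaker-execution-role>",
#       transformers_version="4.6",          # placeholders; match your container
#       pytorch_version="1.7",
#       py_version="py36",
#       hyperparameters={
#           "model_name": "distilbert-base-uncased",  # placeholder model
#           "epochs": 3,
#           "train_batch_size": 32,
#       },
#   )
#   # The 'train' channel maps to SM_CHANNEL_TRAIN, where this script expects
#   # v2.0/dev-v2.0.json and augmented_squad.json to be present.
#   estimator.fit({"train": "s3://<your-bucket>/<path-to-squad-data>/"})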