#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# !pip install spacy
# !python -m spacy download en_core_web_sm

import os

import tensorflow as tf
# from horovod.tensorflow.compression import Compression
import horovod.tensorflow as hvd

import modeling
import tokenization

os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
tf.logging.set_verbosity(tf.logging.ERROR)

flags = tf.flags

FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .tsv files (or other data files) "
    "for the task.")

flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string("task_name", "NER", "The name of the task to train.")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", "albert_base_zh/albert_model.ckpt",
    "Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_bool("do_train", False, "Whether to run training.")

flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")

flags.DEFINE_bool(
    "do_predict", False,
    "Whether to run the model in inference mode on the test set.")

flags.DEFINE_bool("do_export", False, "Whether to export model to pb format.")

flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")

flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")

flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")

flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")

flags.DEFINE_float("num_train_epochs", 3.0,
                   "Total number of training epochs to perform.")

flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")

flags.DEFINE_integer("save_checkpoints_steps", 1000,
                     "How often to save the model checkpoint.")

flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

# "horovod" and "amp" are referenced further down in this script but were not defined
# in the visible flag list; the defaults here are assumptions for a single-GPU,
# full-precision run.
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-GPU training.")

flags.DEFINE_bool("amp", False, "Whether to enable automatic mixed precision (AMP).")

flags.DEFINE_string(
    "tpu_name", None,
    "The Cloud TPU to use for training. This should be either the name "
    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
    "url.")

flags.DEFINE_string(
    "tpu_zone", None,
    "[Optional] GCE zone where the Cloud TPU is located in. If not "
    "specified, we will attempt to automatically detect the GCE zone from "
    "metadata.")

flags.DEFINE_string(
    "gcp_project", None,
    "[Optional] Project name for the Cloud TPU-enabled project. If not "
    "specified, we will attempt to automatically detect the GCE project from "
    "metadata.")

flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")

flags.DEFINE_integer(
    "num_tpu_cores", 8,
    "Only used if `use_tpu` is True. Total number of TPU cores to use.")

# Create the output directory where all the results are saved.
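# Example invocation, assuming this file is saved as run_ner.py (the script name and
# all paths below are placeholders, not the actual ones used in this project):
#
#   python run_ner.py \
#     --task_name=NER \
#     --data_dir=./data/BC5CDR \
#     --vocab_file=./biobert/vocab.txt \
#     --bert_config_file=./biobert/bert_config.json \
#     --init_checkpoint=./biobert/model.ckpt \
#     --output_dir=./results \
#     --do_train=true \
#     --max_seq_length=128 \
#     --train_batch_size=32 \
#     --learning_rate=5e-5 \
#     --num_train_epochs=3.0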
# output_dir = os.path.join(working_dir, 'output')
# tf.gfile.MakeDirs(output_dir)

# DATA_DIR_BIOBERT, working_dir and notebooks_dir are expected to be defined earlier
# in the notebook; they point at the pre-trained BioBERT files and working directories.

# The config json file corresponding to the pre-trained BERT model.
# This specifies the model architecture.
bert_config_file = os.path.join(DATA_DIR_BIOBERT, 'bert_config.json')

# The initial checkpoint of the pre-trained BioBERT model.
init_checkpoint = os.path.join(DATA_DIR_BIOBERT, 'model.ckpt')

batch_size = 1
params = dict([('batch_size', batch_size)])

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
max_seq_length = 128

output_dir = os.path.join(working_dir, 'results')

train_batch_size = 64  # overridden further down
num_train_epochs = 1
num_warmup_steps = 0
hvd_rank = 0
verbose_logging = True

# version_2_with_negative, n_best_size and max_answer_length are SQuAD question-answering
# parameters; they are not used by the NER flow below.
# Set to True if the dataset has samples with no answers. For SQuAD 1.1, this is set to False.
version_2_with_negative = False
# The total number of n-best predictions to generate in the nbest_predictions.json output file.
n_best_size = 20
# The maximum length of an answer that can be generated.
# This is needed because the start and end predictions are not conditioned on one another.
max_answer_length = 30

# The initial learning rate for Adam.
learning_rate = 5e-6
# Total batch size for training.
train_batch_size = 3
# Proportion of training to perform linear learning rate warmup for.
warmup_proportion = 0.1
# Total number of training epochs to perform (results will improve if trained for more epochs).
num_train_epochs = 1

from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags

# Horovod
hvd.init()

# Tokenization
vocab_file = os.path.join(DATA_DIR_BIOBERT, 'vocab.txt')

# Should be True for uncased models and False for cased models.
# The BioBERT available in NGC is uncased.
do_lower_case = True

# Validate the casing config consistency with the checkpoint name.
tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)
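# A minimal sketch (not part of the training flow) of how the FullTokenizer created
# below is typically used for one sentence: WordPiece-tokenize, add [CLS]/[SEP],
# map tokens to vocabulary ids and zero-pad to max_seq_length. The helper name is
# illustrative only; the real feature conversion is done by
# filed_based_convert_examples_to_features further down.
def _encode_sentence_example(tokenizer, text, max_seq_length):
    # Truncate so that [CLS] and [SEP] still fit within max_seq_length.
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the fixed sequence length expected by the model.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
    return input_ids, input_mask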
# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

bert_config_file = os.path.join(DATA_DIR_BIOBERT, 'bert_config.json')
bert_config = modeling.BertConfig.from_json_file(bert_config_file)

# BC5CDRProcessor, filed_based_convert_examples_to_features, file_based_input_fn_builder
# and model_fn_builder are assumed to be defined elsewhere in this script/notebook.
processor = BC5CDRProcessor()
label_list = processor.get_labels()
id2label = {}
for (i, label) in enumerate(label_list, 1):
    id2label[i] = label

# config = tf.ConfigProto(log_device_placement=True)
# run_config = tf.estimator.RunConfig(
#     model_dir=output_dir,
#     session_config=config,
#     save_checkpoints_steps=1000,
#     keep_checkpoint_max=1)

# The training TSV is loaded with the test-example reader; only the file name differs.
train_examples = processor.get_test_examples(notebooks_dir, file_name='train.tsv')
num_train_steps = 1  # int(len(train_examples) / train_batch_size * num_train_epochs)

# Shard the training examples across Horovod ranks: the first `remainder` ranks take
# one extra example so that every example is assigned to exactly one rank.
tmp_filenames = [os.path.join(notebooks_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
num_examples_per_rank = len(train_examples) // hvd.size()
remainder = len(train_examples) % hvd.size()
if hvd.rank() < remainder:
    start_index = hvd.rank() * (num_examples_per_rank + 1)
    end_index = start_index + num_examples_per_rank + 1
else:
    start_index = hvd.rank() * num_examples_per_rank + remainder
    end_index = start_index + num_examples_per_rank
tf.compat.v1.logging.info("hvd.size() = %d", hvd.size())

# For this single-process run the sharded bounds are overridden: all examples are
# written to a single TF record file under FLAGS.output_dir.
start_index = 0
end_index = len(train_examples)
tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]
filed_based_convert_examples_to_features(
    train_examples[start_index:end_index], label_list, max_seq_length, tokenizer,
    tmp_filenames[hvd_rank])

# predict_examples = processor.get_test_examples(notebooks_dir, file_name='input.tsv')
# tf.compat.v1.logging.info("***** Running training *****")
# tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))

train_input_fn = file_based_input_fn_builder(
    input_file=tmp_filenames,  # train_file,
    batch_size=train_batch_size,
    seq_length=max_seq_length,
    is_training=True,
    drop_remainder=True,
    hvd=None if not FLAGS.horovod else hvd)

global_batch_size = train_batch_size
training_hooks = []
training_hooks.append(LogTrainRunHook(global_batch_size, 0))

model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list) + 1,
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_one_hot_embeddings=False,
    hvd=None if not FLAGS.horovod else hvd,
    amp=FLAGS.amp)

print("***** Running training *****")
print("  Num examples = %d" % len(train_examples))
print("  Batch size = %d" % FLAGS.train_batch_size)
print("  Num steps = %d" % num_train_steps)
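# A minimal sketch, under the assumption that training is launched through the
# tf.estimator API as in the Google/NVIDIA BERT fine-tuning scripts; the exact
# RunConfig options and hooks used by the original script may differ.
session_config = tf.compat.v1.ConfigProto()
if FLAGS.horovod:
    # Pin each Horovod rank to a single GPU and broadcast the initial variables
    # from rank 0 so every worker starts from the same weights.
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir if hvd.rank() == 0 else None,  # only rank 0 writes checkpoints
    session_config=session_config,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    keep_checkpoint_max=1)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params=params)

estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)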