from pathlib import Path
import subprocess

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification,
    TFTrainer,  # NOTE: TFTrainer is deprecated in recent transformers releases; this script assumes a version that still ships it
    TFTrainingArguments,
)

transformers.logging.set_verbosity_info()


def read_imdb_split(split_dir):
    """Read the IMDb folder layout: one text file per review, split into pos/ and neg/ subfolders."""
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)  # `==`, not `is`: identity comparison on strings is unreliable
    return texts, labels


def main():
    model_dir = "/opt/ml/processing/model"
    test_path = "/opt/ml/processing/input/test"

    # Unpack the trained model artifact mounted into the Processing container.
    subprocess.run(
        "tar -xvf /opt/ml/processing/model/model.tar.gz -C /opt/ml/processing/model",
        shell=True,
        check=True,
    )

    test_texts, test_labels = read_imdb_split(test_path)

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="tf")
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        test_labels,
    ))

    # No training happens here: the trainer is used only for batched prediction.
    training_args = TFTrainingArguments(
        output_dir="./results",
        do_train=False,
        do_predict=True,
    )
    model = TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_dir)
    trainer = TFTrainer(
        model=model,          # the instantiated 🤗 Transformers model
        args=training_args,   # prediction-only arguments, defined above
    )

    # predict() returns a PredictionOutput namedtuple: (predictions, label_ids, metrics).
    pred, _, _ = trainer.predict(test_dataset=test_dataset)
    pred_label = np.argmax(pred, axis=1)  # logits -> predicted class index

    # Write ground-truth labels next to the model's predictions.
    pd_pred = pd.DataFrame({
        "label": test_labels,
        "inference": pred_label,
    })
    pd_pred.to_csv("/opt/ml/processing/output/pred.csv", index=False)


if __name__ == "__main__":
    main()
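
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script): the /opt/ml/processing/*
# paths indicate this file runs inside a SageMaker Processing job. A minimal
# launcher might look like the commented example below; the image URI, role
# ARN, S3 URIs, instance type, and the script filename "evaluate.py" are all
# placeholder assumptions you must replace with your own values.
#
# from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
#
# processor = ScriptProcessor(
#     image_uri="<container-image-with-tensorflow-and-transformers>",
#     command=["python3"],
#     role="<execution-role-arn>",
#     instance_count=1,
#     instance_type="ml.m5.xlarge",
# )
# processor.run(
#     code="evaluate.py",  # this script
#     inputs=[
#         ProcessingInput(source="<s3://bucket/path/to/model-artifact>",
#                         destination="/opt/ml/processing/model"),
#         ProcessingInput(source="<s3://bucket/path/to/imdb/test>",
#                         destination="/opt/ml/processing/input/test"),
#     ],
#     outputs=[ProcessingOutput(source="/opt/ml/processing/output")],
# )
# ---------------------------------------------------------------------------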