import argparse
import csv
import logging
import os
import re
import traceback
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

# Standard SageMaker Processing container paths
BASE_PATH = os.path.join("/", "opt", "ml")
PROCESSING_PATH = os.path.join(BASE_PATH, "processing")
PROCESSING_PATH_INPUT = os.path.join(PROCESSING_PATH, "input")
PROCESSING_PATH_OUTPUT = os.path.join(PROCESSING_PATH, "output")


def clean_text(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r"\[.*?\]", "", text)                # drop bracketed fragments
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # drop URLs
    text = re.sub(r"\n", "", text)
    text = " ".join(filter(lambda x: x[0] != "@", text.split()))  # drop @mentions
    text = text.replace("u'", "'")                     # strip Python 2 unicode literal prefixes
    text = text.encode("ascii", "ignore").decode()     # drop non-ASCII characters

    # Discard strings left with no alphabetic characters at all
    if not any(c.isalpha() for c in text):
        return ""
    return text


def extract_data(file_path, percentage=100):
    try:
        files = [f for f in listdir(file_path) if isfile(join(file_path, f)) and f.endswith(".csv")]
        LOGGER.info("{}".format(files))

        frames = []

        for file in files:
            df = pd.read_csv(
                os.path.join(file_path, file),
                sep=",",
                quotechar='"',
                quoting=csv.QUOTE_ALL,
                escapechar="\\",
                encoding="utf-8",
                on_bad_lines="skip"  # replaces error_bad_lines=False, removed in pandas 2.0
            )
            # Keep only the first <percentage>% of rows from each file
            df = df.head(int(len(df) * (percentage / 100)))
            frames.append(df)

        return pd.concat(frames)
    except Exception as e:
        stacktrace = traceback.format_exc()
        LOGGER.error("{}".format(stacktrace))
        raise e


def load_data(df, file_path, file_name, header=True):
    try:
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        path = os.path.join(file_path, file_name + ".csv")

        LOGGER.info("Saving file in {}".format(path))

        df.to_csv(
            path,
            index=False,
            header=header,
            quoting=csv.QUOTE_ALL,
            encoding="utf-8",
            escapechar="\\",
            sep=","
        )
    except Exception as e:
        stacktrace = traceback.format_exc()
        LOGGER.error("{}".format(stacktrace))
        raise e


def transform_data(df):
    try:
        # .copy() avoids SettingWithCopyWarning on the assignments below
        df = df[["text", "Sentiment"]].copy()

        LOGGER.info("Original count: {}".format(len(df.index)))

        df = df.dropna()

        df["text"] = df["text"].apply(clean_text)

        # Turn empty or whitespace-only values into NaN so dropna() removes them
        df["text"] = df["text"].map(lambda x: x.strip()).replace("", np.nan)
        df["Sentiment"] = df["Sentiment"].map(lambda x: x.strip()).replace("", np.nan)

        # Encode the sentiment classes as integer ids; unknown values become NaN
        df["Sentiment"] = df["Sentiment"].map({"Negative": 0, "Neutral": 1, "Positive": 2})

        df = df.dropna()

        # map() yields floats once NaNs appear, so cast the labels back to int
        df["Sentiment"] = df["Sentiment"].astype(int)

        df = df.rename(columns={"Sentiment": "labels"})
        df = df[["text", "labels"]]

        LOGGER.info("Current count: {}".format(len(df.index)))

        return df
    except Exception as e:
        stacktrace = traceback.format_exc()
        LOGGER.error("{}".format(stacktrace))
        raise e


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    args = parser.parse_args()

    LOGGER.info("Arguments: {}".format(args))

    df = extract_data(PROCESSING_PATH_INPUT, 100)
    df = transform_data(df)

    data_train, data_test = train_test_split(df, test_size=0.2)

    load_data(data_train, os.path.join(PROCESSING_PATH_OUTPUT, "train"), "train")
    load_data(data_test, os.path.join(PROCESSING_PATH_OUTPUT, "test"), "test")

    ## Creating test dataset for batch inference
    data_test = data_test.drop("labels", axis=1)
    load_data(data_test, os.path.join(PROCESSING_PATH_OUTPUT, "inference"), "data", False)