#!/usr/bin/python3

# A sample train component that trains a CatBoost classifier on product-review data.
# This implementation works in File mode and makes no assumptions about the input file names.
# Input is tab-separated with one data point per row; the label column is named 'target'.

from __future__ import print_function

import argparse
import csv
import json
import os
import pickle
import sys
import time
import traceback
from itertools import chain

import pandas as pd
from pandas.api.types import CategoricalDtype
from catboost import CatBoostClassifier, Pool as CatboostPool, cv
from sklearn import metrics

prefix = '/opt/ml/'

input_path = prefix + 'input/data'
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has two channels of input data, 'train' and (optionally) 'validation'.
# Since we run in File mode, the input files are copied to the directories specified here.
train_channel_name = 'train'
validation_channel_name = 'validation'
training_path = os.path.join(input_path, train_channel_name)
validation_path = os.path.join(input_path, validation_channel_name)

# Feature columns used by the model, grouped by type.
cols = ['verified_purchase', 'review_headline', 'review_body', 'product_title', 'helpful_votes']
numerical_cols = ['helpful_votes']
categorical_cols = ['verified_purchase']
text_cols = ['review_headline', 'review_body', 'product_title']


def ret_pool_obj(X):
    """Wrap a feature frame in an unlabeled CatBoost Pool for prediction."""
    pool_obj = CatboostPool(
        data=X,
        label=None,
        text_features=text_cols,
        cat_features=categorical_cols,
        feature_names=list(X.columns)
    )
    return pool_obj


def fit_catboost(X_train, y_train, weight, catboost_params={}, verbose=100):
    """Fit a CatBoostClassifier on the given features, labels, and sample weights."""
    learn_pool = CatboostPool(
        X_train,
        y_train,
        weight=weight,
        text_features=text_cols,
        cat_features=categorical_cols,
        feature_names=list(X_train.columns)
    )
    catboost_default_params = {
        'iterations': 25000,
        'learning_rate': 0.05,
        'eval_metric': 'AUC',
        'task_type': 'GPU'
    }
    catboost_default_params.update(catboost_params)
    # Use the merged defaults. (The original passed catboost_params here, which
    # silently dropped the defaults for any key the caller did not supply.)
    model = CatBoostClassifier(**catboost_default_params)
    model.fit(learn_pool, verbose=verbose)
    return model


def pre_process(df):
    """Fill missing values: empty strings for text, 'Unk' for categoricals, -9999 elsewhere."""
    df.fillna(
        value={'review_headline': '', 'review_body': '', 'product_title': ''},
        inplace=True)
    df.fillna(value={'verified_purchase': 'Unk'}, inplace=True)
    df.fillna(-9999, inplace=True)
    return df


def pre_process_cat(df, obj_col_categories):
    """Cast categorical columns to the category levels recorded at training time."""
    for col_name in categorical_cols:
        print(col_name)
        cat_type = CategoricalDtype(categories=obj_col_categories.get(col_name), ordered=False)
        df[col_name] = df[col_name].astype(cat_type, copy=False)
    col_names = df.columns
    category_col_names = col_names[df.dtypes == 'category']
    # Values unseen at training time become NaN after the cast; map them to 'Unk'.
    for category in category_col_names:
        df[category].fillna('Unk', inplace=True)
    return df
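
# For reference, a minimal /opt/ml/input/config/hyperparameters.json that train()
# below can consume might look like the following. The values are illustrative,
# not taken from any real training job; SageMaker delivers all hyperparameter
# values as strings, which is why train() casts them to int/float:
#
#   {
#       "iterations": "50000",
#       "learning_rate": "0.05",
#       "max_depth": "10",
#       "max_ctr_complexity": "15"
#   }
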
# The function that executes the training.
def train(args):
    print('Starting the train.')
    try:
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)
        print(training_path)
        print(validation_path)
        hpo_flag = args.HPO
        print(hpo_flag)

        # Take the set of training files and read them all into a single pandas dataframe.
        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(training_path, train_channel_name))
        raw_data = [pd.read_csv(file, sep='\t', lineterminator='\n', quotechar='"',
                                quoting=csv.QUOTE_ALL) for file in input_files]
        raw_df = pd.concat(raw_data)
        print('read the data', raw_df.dtypes)
        raw_df = pre_process(raw_df)
        train_data = raw_df.copy(deep=True)
        print('Training data shape after revision: ', train_data.shape)

        if hpo_flag == 1:
            input_files = [os.path.join(validation_path, file) for file in os.listdir(validation_path)]
            if len(input_files) == 0:
                raise ValueError(('There are no files in {}.\n' +
                                  'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                                  'the data specification in S3 was incorrectly specified or the role specified\n' +
                                  'does not have permission to access the data.').format(validation_path, validation_channel_name))
            # Read the validation set the same way as the training set.
            raw_data = [pd.read_csv(file, sep='\t', lineterminator='\n', quotechar='"',
                                    quoting=csv.QUOTE_ALL) for file in input_files]
            raw_df = pd.concat(raw_data)
            raw_df = pre_process(raw_df)
            validation_data = raw_df.copy(deep=True)
            print('validation data', validation_data.shape)

        print('Training data shape :- ', train_data.shape)
        print('Training data columns :- ', train_data.columns)

        # Record the category levels seen in training (plus an explicit 'Unk' level)
        # so the same levels can be applied to validation and inference data.
        obj_col_categories = {}
        for col_name in categorical_cols:
            cat = [list(train_data[col_name].value_counts().to_dict().keys()) + ['Unk']]
            print(col_name, ': Categorical Levels: ', len(cat[0]))
            obj_col_categories[col_name] = set(chain.from_iterable(cat))
            cat_type = CategoricalDtype(categories=obj_col_categories.get(col_name), ordered=False)
            train_data[col_name] = train_data[col_name].astype(cat_type, copy=False)
            train_data[col_name].fillna('Unk', inplace=True)

        start_time = time.time()
        print('Hyper-parameter optimization flag : ', hpo_flag)

        # SageMaker passes hyperparameters as JSON strings, so cast them explicitly.
        iterations = trainingParams.get('iterations', None)
        iterations = int(iterations) if iterations is not None else 50000
        learning_rate = trainingParams.get('learning_rate', None)
        learning_rate = float(learning_rate) if learning_rate is not None else 0.05
        max_depth = trainingParams.get('max_depth', None)
        max_depth = int(max_depth) if max_depth is not None else 10
        max_ctr_complexity = trainingParams.get('max_ctr_complexity', None)
        max_ctr_complexity = int(max_ctr_complexity) if max_ctr_complexity is not None else 15

        if hpo_flag == 1:
            print('Files in the validation channel')
            validation_data = pre_process_cat(validation_data, obj_col_categories)
            print("--- Time to preprocess validation data: %s seconds ---" % (time.time() - start_time))

        print('Training Started:- ')
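        # Note: the block below computes the label distribution for logging only and
        # trains with uniform sample weights. If per-class weighting were desired
        # instead, a hypothetical sketch (not part of the original logic) could be:
        #
        #   counts = train_data['target'].value_counts()
        #   pos_weight = counts[0] / counts[1]
        #   weight = [pos_weight if y == 1 else 1.0 for y in train_data['target']]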
        # scale_pos_weight_fct = train_data['target'].value_counts()[0] / train_data['target'].value_counts()[1]
        scale_pos_weight_fct = train_data['target'].value_counts()
        print("scale_pos_weight_fct:", scale_pos_weight_fct)
        weight = [1.0] * train_data.shape[0]

        params = {
            'iterations': iterations,          # default 50000
            'depth': max_depth,                # default 10
            'learning_rate': learning_rate,    # default 0.05
            # Parsed above; the original never passed it to CatBoost (default 15).
            'max_ctr_complexity': max_ctr_complexity,
            'loss_function': 'Logloss',
            'eval_metric': 'PRAUC:use_weights=False',
            'random_seed': 42,
            'border_count': 254,
            'logging_level': 'Verbose',
            'use_best_model': False,
            # 10% of the total training iterations is a reasonable choice.
            'early_stopping_rounds': 500,
            'task_type': 'GPU',
            'one_hot_max_size': 253,
            'metric_period': 100,
            'text_processing': {
                'tokenizers': [
                    {
                        'tokenizer_id': 'Sense',
                        'separator_type': 'BySense',
                        'lowercasing': 'true',
                    }
                ],
                'dictionaries': [{
                    'dictionary_id': 'CharTriGram',
                    'token_level_type': 'Letter',
                    'gram_order': '3',
                    'max_dictionary_size': '20000',
                }],
                'feature_processing': {
                    'default': [{
                        'tokenizers_names': ['Sense'],
                        'dictionaries_names': ['CharTriGram'],
                        'feature_calcers': ['BoW:top_tokens_count=1000', 'NaiveBayes']
                    }]
                }
            }
        }

        model = fit_catboost(train_data[numerical_cols + categorical_cols + text_cols],
                             train_data['target'], weight, catboost_params=params)
        print("--- Time to train: %s seconds ---" % (time.time() - start_time))

        if hpo_flag == 1:
            # Score the validation set and report area under the precision-recall curve.
            input_pool = ret_pool_obj(validation_data[numerical_cols + categorical_cols + text_cols])
            prob = model.predict_proba(input_pool)[:, 1]
            precision, recall, thresholds = metrics.precision_recall_curve(
                validation_data['target'].values, prob)
            auc = metrics.auc(recall, precision)
            print('auc:', auc)

        print('Saving the model:')
        model.save_model(os.path.join(model_path, 'model-classification-prod'),
                         format="cbm", export_parameters=None, pool=None)
        with open(os.path.join(model_path, 'obj_col_categories.pkl'), 'wb') as out:
            pickle.dump(obj_col_categories, out)
        print('Training complete.')
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during train: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to appear in the training job logs as well.
        print('Exception during train: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)


def get_parser():
    parser = argparse.ArgumentParser(
        description="Script to build Q2X model."
    )
    parser.add_argument("--HPO",
                        help="Set to 1 to evaluate on the validation channel after training.",
                        type=int, default=1)
    return parser


if __name__ == "__main__":
    print('Starting the training script.')
    args = get_parser().parse_args()
    train(args)
    sys.exit(0)
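
# Example invocations (illustrative only; the script file name and image tag are
# assumptions, not taken from the original source):
#
#   # Inside a SageMaker-style container with /opt/ml already populated:
#   python3 train.py --HPO 1
#
#   # Locally, mounting the expected directory layout from ./ml:
#   docker run -v "$(pwd)/ml:/opt/ml" my-catboost-image python3 train.py --HPO 0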