#  Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License").
#  You may not use this file except in compliance with the License.
#  A copy of the License is located at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  or in the "license" file accompanying this file. This file is distributed
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
#  express or implied. See the License for the specific language governing
#  permissions and limitations under the License.

from __future__ import print_function

import argparse
import os

from io import StringIO
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

import joblib
import json
import sys
import csv
import pickle
from my_custom_library import cross_validation, cross_validation_catboost
from sagemaker_containers import _content_types
import xgboost as xgb
from sklearn.metrics import mean_squared_error


# File name of the serialized CatBoost model inside the model directory.
# The training script writes the model under this name and model_fn reads it back.
model_file_name = 'catboost-regressor-model.dump'


def _load_channel(channel_dir, channel_name):
    """Read every file of an input channel into a single DataFrame.

    Args:
        channel_dir: Directory containing the channel's files.
        channel_name: Name of the channel; used only in the error message.

    Returns:
        The row-wise concatenation of all files in ``channel_dir``, each
        parsed as a header-less CSV.

    Raises:
        ValueError: If ``channel_dir`` contains no entries.
    """
    input_files = [os.path.join(channel_dir, entry) for entry in os.listdir(channel_dir)]
    if not input_files:
        raise ValueError(
            (
                "There are no files in {}.\n"
                + "This usually indicates that the channel ({}) was incorrectly specified,\n"
                + "the data specification in S3 was incorrectly specified or the role specified\n"
                + "does not have permission to access the data."
            ).format(channel_dir, channel_name)
        )
    frames = [pd.read_csv(path, header=None, engine="python") for path in input_files]
    return pd.concat(frames)


if __name__ == "__main__":
    # Training entry point: fits a CatBoost and an XGBoost regressor with
    # K-fold cross validation, scores the mean of their predictions on the
    # validation channel, and stores both models in the model directory.
    print("Training Started")
    parser = argparse.ArgumentParser()

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--validation", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
    # NOTE(review): SageMaker usually exports hyperparameters as upper-cased
    # SM_HP_* variables -- confirm "SM_HP_num_round" is the intended name.
    # When the variable is absent the default is None.
    parser.add_argument("--num_round", type=int, default=os.environ.get("SM_HP_num_round"))
    parser.add_argument("--max_depth", type=int, default=5)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--objective", type=str, default="reg:squarederror")
    parser.add_argument("--k_fold", type=int, default=5)

    args = parser.parse_args()
    print("Got Args: {}".format(args))

    # Take the set of files of each channel and read them into one dataframe.
    train_df = _load_channel(args.train, "train")
    validation_df = _load_channel(args.validation, "validation")

    # Assumption is that the label is the last column
    print('building training and validation datasets')
    X_validation = validation_df.iloc[:, :-1].values
    y_validation = validation_df.iloc[:, -1:].values

    K = args.k_fold

    # ---- Define and train CatBoost ----
    catboost_hyperparameters = {
        "max_depth": args.max_depth,
        "eta": args.eta,
    }

    # The cross-validation helpers receive the full training frame
    # (features and label together) and return per-fold RMSEs plus a model.
    rmse_list, model_catboost = cross_validation_catboost(train_df, K, catboost_hyperparameters)
    k_fold_avg = sum(rmse_list) / len(rmse_list)
    print(f"RMSE average across folds for CatBoost model: {k_fold_avg}")

    # generate model predictions against the validation dataset
    pred_catboost = model_catboost.predict(X_validation)

    # persist model
    path = os.path.join(args.model_dir, model_file_name)
    print('saving model file to {}'.format(path))
    model_catboost.save_model(path)

    # ---- Train the XGBoost model ----
    hyperparameters = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "objective": args.objective,
        "num_round": args.num_round,
    }

    rmse_list, model_xgb = cross_validation(train_df, K, hyperparameters)
    k_fold_avg = sum(rmse_list) / len(rmse_list)
    print(f"RMSE average across folds for XGBoost model: {k_fold_avg}")

    # get the prediction results against the validation dataset of the xgboost model
    dtest = xgb.DMatrix(X_validation)
    pred_xgb = model_xgb.predict(dtest,
                                 ntree_limit=getattr(model_xgb, "best_ntree_limit", 0),
                                 validate_features=False)

    # generate the mean of the results predicted by the two models and calculate the rmse
    pred_mean = np.mean(np.array([pred_catboost, pred_xgb]), axis=0)
    # Take the square root explicitly rather than passing squared=False,
    # which newer scikit-learn releases no longer accept; the value is the
    # same for a single regression target.
    val_rmse = np.sqrt(mean_squared_error(y_validation, pred_mean))
    print(f"Final evaluation result: validation-rmse:{val_rmse}")

    # Persist the XGBoost model next to the CatBoost one; the context manager
    # guarantees the file is flushed and closed before the job finishes.
    model_location = os.path.join(args.model_dir, "xgboost-model")
    with open(model_location, "wb") as model_out:
        pickle.dump(model_xgb, model_out)
    print("Stored trained model at {}".format(model_location))

    print("Training Completed")


def input_fn(input_data, content_type):
    """Parse a CSV request payload into a NumPy array.

    Args:
        input_data: Comma-separated text. ``bytes``/``bytearray`` payloads
            are decoded as UTF-8 before parsing.
        content_type: Declared content type of the request. It is not
            inspected; the payload is always parsed as comma-separated values.

    Returns:
        The array built by ``np.genfromtxt`` with ``dtype=None``, i.e. the
        dtype is inferred from the data.
    """
    # StringIO only accepts text, so a raw bytes body would raise TypeError.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = input_data.decode("utf-8")
    payload = StringIO(input_data)

    return np.genfromtxt(payload, dtype=None, delimiter=",")

def model_fn(model_dir):
    """Deserialize and return the fitted models.

    Note that the file names used here must match the names the models are
    serialized under in the main method.

    Args:
        model_dir: Directory containing the CatBoost model file
            (``model_file_name``) and the pickled ``xgboost-model`` file.

    Returns:
        A two-element list ``[catboost_model, xgboost_model]``.
    """
    catboost_model = CatBoostRegressor()
    catboost_model.load_model(os.path.join(model_dir, model_file_name))

    # NOTE: unpickling executes arbitrary code from the file, so only model
    # artifacts from a trusted source must be loaded here.
    xgb_model_path = os.path.join(model_dir, "xgboost-model")
    with open(xgb_model_path, "rb") as model_in:
        xgb_model = pickle.load(model_in)

    return [catboost_model, xgb_model]


def predict_fn(input_data, model):
    """Predict with both models and return the mean of their outputs.

    Args:
        input_data: Feature array passed to the CatBoost model's ``predict``
            and wrapped in an ``xgb.DMatrix`` for the XGBoost model.
        model: Two-element sequence ``[catboost_model, xgboost_model]`` as
            returned by ``model_fn``.

    Returns:
        The element-wise mean of the CatBoost and XGBoost predictions.
    """
    catboost_model = model[0]
    xgb_model = model[1]

    predictions_catb = catboost_model.predict(input_data)
    print("catboost results:")
    print(predictions_catb)

    dtest = xgb.DMatrix(input_data)
    # Read best_ntree_limit from the XGBoost model itself. Looking it up on
    # the [catboost, xgboost] list always fell back to 0, which disagreed
    # with how the training script evaluates the same model.
    predictions_xgb = xgb_model.predict(dtest,
                                        ntree_limit=getattr(xgb_model, "best_ntree_limit", 0),
                                        validate_features=False)
    print("xgboost results:")
    print(predictions_xgb)

    return np.mean(np.array([predictions_catb, predictions_xgb]), axis=0)