# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import, print_function

import argparse
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters and data/model directories. SageMaker passes these as
    # command-line arguments; the SM_* environment variables are set inside
    # the training container.
    parser.add_argument("--objective", type=str, default="reg:squarederror")
    parser.add_argument("--colsample-bytree", type=float, default=0.3)
    parser.add_argument("--learning-rate", type=float, default=0.1)
    parser.add_argument("--max-depth", type=int, default=5)
    parser.add_argument("--reg-alpha", type=int, default=10)
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])

    args = parser.parse_args()

    # Load the Boston housing data into a pandas DataFrame. Note that
    # load_boston was removed in scikit-learn 1.2, so this script assumes an
    # earlier scikit-learn version.
    boston = load_boston()
    data = pd.DataFrame(boston.data)
    data.columns = boston.feature_names
    data["PRICE"] = boston.target

    # Convert the pandas DataFrame to an XGBoost DMatrix, which is used later
    # for cross-validation with the native XGBoost API.
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    data_dmatrix = xgb.DMatrix(data=X, label=y)

    # Create the train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # Create a regressor object using the scikit-learn API.
    xg_reg = xgb.XGBRegressor(
        objective=args.objective,
        colsample_bytree=args.colsample_bytree,
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        reg_alpha=args.reg_alpha,
        n_estimators=args.n_estimators,
    )

    # Train and save the model.
    xg_reg.fit(X_train, y_train)
    model_path = os.path.join(args.model_dir, "xgb-boston.model")
    xg_reg.get_booster().save_model(model_path)

    # Make predictions on the held-out test set and calculate RMSE.
    preds = xg_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % rmse)

    # Plot the feature importance and store the graph as an image.
    if not os.path.exists(args.output_data_dir):
        os.makedirs(args.output_data_dir)
    ax = xgb.plot_importance(xg_reg)
    fig = ax.figure
    fig.set_size_inches(5, 5)
    fig.savefig(os.path.join(args.output_data_dir, "feature-importance-plot.png"))

    # Finally, let's do a bit of cross-validation using native XGBoost
    # functionality (keeping some parameters constant so that we don't have a
    # huge input list for this simple example).
    params = {
        "objective": args.objective,
        "colsample_bytree": args.colsample_bytree,
        "learning_rate": args.learning_rate,
        "max_depth": args.max_depth,
        "alpha": args.reg_alpha,
    }
    cv_results = xgb.cv(
        dtrain=data_dmatrix,
        params=params,
        nfold=5,
        num_boost_round=50,
        early_stopping_rounds=10,
        metrics="rmse",
        as_pandas=True,
        seed=100,
    )
    cv_results.to_csv(os.path.join(args.output_data_dir, "cv_results.csv"))
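
    # In addition to saving the full table, report the mean test RMSE from the
    # final boosting round. With metrics="rmse" and as_pandas=True, xgb.cv
    # returns a DataFrame whose columns include "test-rmse-mean".
    print("CV RMSE (final round): %f" % cv_results["test-rmse-mean"].iloc[-1])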
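
    # As a final sanity check (a small added sketch, not part of the original
    # example): reload the saved booster and confirm its predictions match the
    # in-memory model on the test set. Booster.predict takes a DMatrix.
    loaded = xgb.Booster()
    loaded.load_model(model_path)
    reloaded_preds = loaded.predict(xgb.DMatrix(X_test))
    assert np.allclose(preds, reloaded_preds, atol=1e-5)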
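

def model_fn(model_dir):
    """Load the saved booster for inference.

    A minimal sketch, assuming this script is also used as the entry point for
    the SageMaker XGBoost serving container, which calls model_fn(model_dir)
    to deserialize the model at endpoint startup. The file name must match the
    one used when saving the model above.
    """
    booster = xgb.Booster()
    booster.load_model(os.path.join(model_dir, "xgb-boston.model"))
    return booster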