#!/usr/bin/env python

# A sample training component that trains a simple scikit-learn decision tree model.
# This implementation works in File mode and makes no assumptions about the input file names.
# Input is specified as CSV with a data point in each row and the labels in the first column.

from __future__ import print_function

import json
import os
import pickle
import sys
import traceback
import logging

import pandas as pd
from sklearn import tree

from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())
# These are the paths to where SageMaker mounts interesting things in your container.

prefix = '/opt/ml/'

input_path = os.path.join(prefix, 'input/data')
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name_training='training'
training_path = os.path.join(input_path, channel_name_training)

channel_name_validation='validation'
validation_path = os.path.join(input_path, channel_name_validation)

# The function to execute the training.
def train():
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)

        # Take the set of files and read them all into a single pandas dataframe
        input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(training_path, channel_name_training))
        raw_data = [ pd.read_csv(file, header=None) for file in input_files ]
        train_data = pd.concat(raw_data)

        # labels are in the first column
        train_y = train_data.iloc[:,0]
        train_X = train_data.iloc[:,1:]

        # Here we only support a single hyperparameter. Note that hyperparameters are always passed in as
        # strings, so we need to do any necessary conversions.
        max_leaf_nodes = trainingParams.get('max_leaf_nodes', None)
        if max_leaf_nodes is not None:
            max_leaf_nodes = int(max_leaf_nodes)
        max_depth = trainingParams.get('max_depth', None)
        if max_depth is not None:
            max_depth = int(max_depth)

        clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, max_depth=max_depth)
        clf = clf.fit(train_X, train_y)


        # save the model
        os.system('sudo chown -R 1000:100 ' + model_path)
        with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'wb') as out:
            pickle.dump(clf, out)
            
        print('Training complete.')

        return clf
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        os.system('sudo chown -R 1000:100 ' + output_path)
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)

        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

# The function to execute the validation.
def validation(clf):
    print('Starting the validation.')
    try:

        # Take the set of files and read them all into a single pandas dataframe
        input_files = [ os.path.join(validation_path, file) for file in os.listdir(validation_path) ]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(validation_path, channel_name_validation))
        raw_data = [ pd.read_csv(file, header=None) for file in input_files ]
        validation_data = pd.concat(raw_data)

        # labels are in the first column
        train_y = validation_data.iloc[:,0]
        train_X = validation_data.iloc[:,1:]

        predictions = clf.predict(train_X)
        auc = roc_auc_score(train_y, predictions)
        logger.info(('auc:{}').format(auc))

    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        os.system('sudo chown -R 1000:100 ' + output_path)
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)


if __name__ == '__main__':
    clf = train()
    validation(clf)

    # A zero exit code causes the job to be marked a Succeeded.
    sys.exit(0)