import logging
import json
import pandas as pd
import numpy as np
import argparse
import ssl

from sklearn import model_selection
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from tensorflow.python.lib.io import file_io


def _parse_column_names(value):
    """Split a comma-separated command-line value into a list of column names."""
    names = [name.strip() for name in value.split(',')]
    if not all(names):
        raise argparse.ArgumentTypeError(
            'column names must be a comma-separated list of non-empty names, got %r' % value)
    return names


def create_parser():
    """Build the command-line parser for the IRIS metrics pipeline step.

    Returns:
        argparse.ArgumentParser: parser exposing ``--input_url``,
        ``--input_column_names``, ``--test_size`` and ``--train_random_seed``.
        Every option is optional and has a default.
    """
    parser = argparse.ArgumentParser(description='IRIS Pipeline Metrics Visualization Example')

    # str.strip removes stray whitespace around a URL given on the command line.
    parser.add_argument('--input_url', type=str.strip, required=False, help='The Input dataset url',
                        default='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
    # `type=list` would explode the argument string into single characters, so
    # the value is parsed as a comma-separated list instead. The non-string
    # default below is used as-is by argparse (no type conversion is applied).
    parser.add_argument('--input_column_names', type=_parse_column_names, required=False,
                        help='The Input dataset column names (comma-separated)',
                        default=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'species'])
    parser.add_argument('--test_size', type=float, required=False,
                        help='The test dataset portion among the raw dataset', default=0.2)
    parser.add_argument('--train_random_seed', type=int, required=False, help='The training random seed', default=7)

    return parser


def preprocess(args):
    """Load the dataset and split it into training and test partitions.

    Args:
        args: Parsed arguments providing ``input_url``, ``input_column_names``,
            ``test_size`` and ``train_random_seed``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` as produced by
        ``model_selection.train_test_split``.
    """
    frame = pd.read_csv(args.input_url, names=args.input_column_names)
    data = frame.values

    # Columns 0-3 are used as features; column 4 holds the class name.
    features = data[:, 0:4]
    # Binary target: 1 where the class is 'Iris-setosa', 0 for everything else.
    is_setosa = data[:, 4] == 'Iris-setosa'
    labels = np.where(is_setosa, 1, 0)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features,
        labels,
        test_size=args.test_size,
        random_state=args.train_random_seed,
    )
    return X_train, X_test, y_train, y_test


def sklearn_training(X_train, y_train):
    """Fit a logistic regression classifier with default settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return LogisticRegression().fit(X_train, y_train)


def inference(model, X_test, y_test):
    """Evaluate a fitted classifier on the test split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``decision_function``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(acc, auc)``: the accuracy of the predicted labels and the
        ROC AUC computed from the decision-function scores.
    """
    y_pred = model.predict(X_test)
    # Ground truth goes first, matching the (y_true, y_pred) metric signature
    # and the roc_auc_score call below.
    acc = accuracy_score(y_test, y_pred)

    # ROC AUC is computed from continuous scores rather than hard labels.
    y_score = model.decision_function(X_test)
    auc = roc_auc_score(y_test, y_score)

    return acc, auc


def write_metadata(acc, auc):
    """Write the accuracy and ROC AUC values to the pipeline metrics file.

    Args:
        acc: Accuracy value; must be numeric.
        auc: ROC AUC value; must be numeric.

    The metrics are serialized as JSON to ``/mlpipeline-metrics.json``.
    """
    named_values = (('accuracy-score', acc), ('roc-auc-score', auc))
    payload = {
        'metrics': [
            {
                # The name is visualized as the column name in the runs table.
                'name': metric_name,
                # The value of the metric; must be numeric.
                'numberValue': metric_value,
                # Optional display format of the metric.
                'format': "PERCENTAGE",
            }
            for metric_name, metric_value in named_values
        ]
    }

    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as out_file:
        json.dump(payload, out_file)

    logging.info("Succeed in Writing Metrics")


def main(argv=None):
    """Run the pipeline step: parse arguments, train, evaluate, write metrics.

    Args:
        argv: Optional list of command-line argument strings. When ``None``,
            argparse falls back to ``sys.argv[1:]``.
    """
    parser = create_parser()
    # Forward the caller-supplied argv; it used to be accepted but ignored.
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)

    # SECURITY NOTE: this swaps the default HTTPS context factory for the
    # unverified one, disabling certificate verification process-wide. It is
    # kept to preserve existing behaviour; remove it once the dataset host
    # can be reached with verification enabled.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Preprocess
    X_train, X_test, y_train, y_test = preprocess(args)

    # Sklearn Training
    sk_model = sklearn_training(X_train, y_train)

    # Inference
    acc, auc = inference(sk_model, X_test, y_test)

    # Write the metrics into the metadata json
    write_metadata(acc, auc)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()