# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from __future__ import print_function, absolute_import

import argparse
import numpy as np
import os
import gzip

from six import BytesIO

from sklearn.compose import make_column_transformer
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler


def create_preprocessing_pipeline(num_columns):
    preprocessor = make_column_transformer(
        (np.arange(num_columns), StandardScaler()),
        remainder='passthrough'
    )
    return preprocessor


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument("--epochs", type=int, default=-1)
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TEST"])
    args = parser.parse_args()

    # Load the data into memory as numpy arrays
    data_path = os.path.join(args.train, "mnist.npy.gz")
    with gzip.open(data_path, "rb") as f:
        data = np.load(f, allow_pickle=True)
    train_set = data[0]
    test_set = data[1]
    train_file = {'x': train_set[:, 1:], 'y': train_set[:, 0]}

    preprocessor = create_preprocessing_pipeline(train_file['x'].shape[1])
    preprocessor.fit(X=train_file['x'], y=train_file['y'])
    joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))
    print("saved model!")


def input_fn(input_data, content_type):
    # Load the data into memory as numpy arrays
    buf = BytesIO(input_data)
    data = np.load(buf, allow_pickle=True)
    train_set = data[0]
    return train_set[:50, :]


def predict_fn(data, model):
    transformed = np.concatenate((data[:, 0].reshape(-1, 1), model.transform(data[:, 1:])), axis=1)
    return transformed


def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf