# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. # A copy of the License is located at # # http://www.apache.org/licenses/LICENSE-2.0 # # or in the "license" file accompanying this file. This file is distributed # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language governing # permissions and limitations under the License. from __future__ import print_function import argparse import os import numpy as np import joblib import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score import delta_sharing if __name__ == "__main__": print("Training Started") parser = argparse.ArgumentParser() # Hyperparameters are described here. In this simple example we are just including one hyperparameter. parser.add_argument("--max_leaf_nodes", type=int, default=-1) # Sagemaker specific arguments. Defaults are set in the environment variables. parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() print("Got Args: {}".format(args)) # Take the profile file, create a SharingClient, and read data from the delta lake table profile_files = [os.path.join(args.train, file) for file in os.listdir(args.train)] if len(profile_files) == 0: raise ValueError( ( "There are no files in {}.\n" + "This usually indicates that the channel ({}) was incorrectly specified,\n" + "the data specification in S3 was incorrectly specified or the role specified\n" + "does not have permission to access the data." ).format(args.train, "train") ) profile_file = profile_files[0] print(f'Found profile file: {profile_file}') # Create a SharingClient client = delta_sharing.SharingClient(profile_file) table_url = profile_file + "#delta_sharing.default.boston-housing" # Load the table as a Pandas DataFrame print('Loading boston-housing table from Delta Lake') train_data = delta_sharing.load_as_pandas(table_url) print(f'Train data shape: {train_data.shape}') # Drop null values - THIS SHOULD BE DONE IN PRE-PROCESSING STAGE AS BEST PRACTISE! train_data.dropna(inplace=True) # Split the data into training and testing sets X = train_data.iloc[:, 1:14] Y = train_data.iloc[:, 14] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5) print(f'X_train.shape: {X_train.shape}') print(f'X_test.shape: {X_test.shape}') print(f'Y_train.shape: {Y_train.shape}') print(f'Y_test.shape: {Y_test.shape}') linear_model = LinearRegression() linear_model.fit(X_train, Y_train) # model evaluation for training set y_train_predict = linear_model.predict(X_train) rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict))) r2 = r2_score(Y_train, y_train_predict) print("The model performance for training set") print("--------------------------------------") print(f'RMSE is {rmse}') print(f'R2 score is {r2}') print("\n") # Save model joblib.dump(linear_model, os.path.join(args.model_dir, "model.joblib")) print("Training Completed") def model_fn(model_dir): """Deserialized and return fitted model Note that this should have the same name as the serialized model in the main method """ clf = joblib.load(os.path.join(model_dir, "model.joblib")) return clf