# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
from __future__ import print_function

import argparse
import logging
import os
from io import StringIO

import joblib
import numpy as np
import pandas as pd

# DON'T FORGET TO INCLUDE ALL NECESSARY LIBRARIES!!!!!!!!!!!
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_score, f1_score
import xgboost as xgb
# DON'T FORGET TO INCLUDE ALL NECESSARY LIBRARIES!!!!!!!!!!!

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

# ACTION 1
# Copy your `numeric_features` and your `categorical_features` here
numeric_features = []
categorical_features = []
# END ACTION 1
# ---
# EXAMPLE
"""
numeric_features = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]']
categorical_features = ['Type']
"""
# END EXAMPLE
# ---

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters are described here.
    # YOU CAN EXTEND THIS SECTION TO YOUR FAVOR, e.g.
    # ADD MORE HYPERPARAMETERS THAT ARE USED IN THIS CONTAINER
    # EXAMPLE: parser.add_argument("--my-example-hp", type=str, default="awesome")
    parser.add_argument("--random_state", type=int, default=42)
    parser.add_argument("--n_estimators", type=int, default=500)
    parser.add_argument("--max_depth", type=int, default=12)
    parser.add_argument("--n_jobs", type=int, default=-1)

    # HELPER SECTION
    # SageMaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--validation", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])

    args = parser.parse_args()

    # Take the set of files and read them all into a single pandas DataFrame
    train = [os.path.join(args.train, file) for file in os.listdir(args.train)]
    if len(train) == 0:
        raise ValueError(
            (
                "There are no files in {}.\n"
                + "This usually indicates that the channel ({}) was incorrectly specified,\n"
                + "the data specification in S3 was incorrectly specified or the role specified\n"
                + "does not have permission to access the data."
            ).format(args.train, "train")
        )

    # Read the DataFrames into a list and concatenate them into one DataFrame
    train_data = [pd.read_csv(file) for file in train]
    train_data = pd.concat(train_data)

    # Take the set of files and read them all into a single pandas DataFrame
    validation = [os.path.join(args.validation, file) for file in os.listdir(args.validation)]
    if len(validation) == 0:
        raise ValueError(
            (
                "There are no files in {}.\n"
                + "This usually indicates that the channel ({}) was incorrectly specified,\n"
                + "the data specification in S3 was incorrectly specified or the role specified\n"
                + "does not have permission to access the data."
            ).format(args.validation, "validation")
        )

    # Read the DataFrames into a list and concatenate them into one DataFrame
    validation_data = [pd.read_csv(file) for file in validation]
    validation_data = pd.concat(validation_data)
    # END HELPER SECTION
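
    # Quick sanity check on the loaded channels (an optional, minimal sketch;
    # it only logs shapes and does not change the training flow):
    logger.info("Loaded train data with shape %s", train_data.shape)
    logger.info("Loaded validation data with shape %s", validation_data.shape)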

    # ACTION 2
    # The data that will be read in contains all columns, i.e. your
    # features and your target. Remember: in preprocessing we set the
    # target column as the very first one in the DataFrame.
    # Task: Create X_train, y_train, X_val and y_val objects using
    # `train_data` and `validation_data`

    # END ACTION 2
    # ---
    # EXAMPLE
    """
    X_train, y_train = train_data.iloc[:, 1:], train_data.iloc[:, 0]
    X_val, y_val = validation_data.iloc[:, 1:], validation_data.iloc[:, 0]
    """
    # END EXAMPLE
    # ---

    # ACTION 3
    # Copy and paste your model code in here:

    # END ACTION 3
    # ---
    # EXAMPLE
    """
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    clf = xgb.XGBClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=args.random_state,
        n_jobs=args.n_jobs)

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_val = le.transform(y_val)

    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", clf)]
    ).fit(X_train, y_train)

    print("model score: %.3f" % model.score(X_val, y_val))

    y_pred = model.predict(X_val)
    y_hat = model.predict(X_train)

    print("In Sample")
    print(classification_report(y_train, y_hat, zero_division=1))
    print(confusion_matrix(y_train, y_hat))

    print("Out of Sample")
    print(classification_report(y_val, y_pred, zero_division=1))
    print(confusion_matrix(y_val, y_pred), "\n")

    print(f"train-recall:{recall_score(y_train, y_hat, average='macro', zero_division=1)};")
    print(f"validation-recall:{recall_score(y_val, y_pred, average='macro', zero_division=1)};")
    print(f"train-precision:{precision_score(y_train, y_hat, average='macro', zero_division=1)};")
    print(f"validation-precision:{precision_score(y_val, y_pred, average='macro', zero_division=1)};")
    print(f"train-f1:{f1_score(y_train, y_hat, average='macro', zero_division=1)};")
    print(f"validation-f1:{f1_score(y_val, y_pred, average='macro', zero_division=1)};")

    # Refit the final model on the combined train and validation data.
    # y_train and y_val are NumPy arrays after LabelEncoder, so concatenate
    # them directly instead of wrapping them in DataFrames.
    X = pd.concat(objs=[X_train, X_val], axis=0)
    y = np.concatenate([y_train, y_val])
    model = model.fit(X, y)
    """
    # END EXAMPLE
    # ---

    # Helper section - no action required!
    # Save the model to model_dir so that it can be loaded by model_fn
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))

    # Print success
    logger.info("Saved model!")


def input_fn(input_data, content_type="text/csv"):
    """Parse the input data payload.

    Args:
        input_data (str or bytes): The raw request payload.
        content_type (str): A string expected to be 'text/csv'.

    Returns:
        df: pandas.core.frame.DataFrame
    """
    try:
        if "text/csv" in content_type:
            df = pd.read_csv(StringIO(input_data))
            return df
        elif "application/json" in content_type:
            df = pd.read_json(StringIO(input_data.decode("utf-8")))
            return df
        else:
            df = pd.read_csv(StringIO(input_data.decode("utf-8")))
            return df
    except ValueError as e:
        logger.error(f"ValueError {e}")
        raise


def output_fn(prediction, accept="text/csv"):
    """Format the prediction output.

    Args:
        prediction (pandas.core.frame.DataFrame): A DataFrame with predictions.
        accept (str): A string expected to be 'text/csv'.

    Returns:
        str: The predictions in CSV format.
    """
    return prediction.to_csv(index=False)
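

# Example of what input_fn/output_fn do with a request (a hypothetical,
# commented-out sketch; the column names are taken from the EXAMPLE feature
# lists above and depend on your own preprocessing):
#
#   payload = "Type,Air temperature [K],Process temperature [K]\nM,298.1,308.6"
#   df = input_fn(payload, "text/csv")    # -> DataFrame with one row
#   csv_out = output_fn(df, "text/csv")   # -> CSV string without the index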


def predict_fn(input_data, model):
    """Generate predictions for the input data.

    Args:
        input_data (pandas.core.frame.DataFrame): A pandas.core.frame.DataFrame.
        model: The loaded model returned by `model_fn`.

    Returns:
        output: pandas.core.frame.DataFrame
    """
    # Run the model on the parsed input data
    output = pd.DataFrame(model.predict(input_data))
    return output


def model_fn(model_dir):
    """Deserialize the fitted model.

    This simple function takes the path of the model, loads it,
    deserializes it and returns it for prediction.

    Args:
        model_dir (str): A string that indicates where the model is located.

    Returns:
        model: The deserialized model object.
    """
    # Load the model and deserialize it
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return model
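

# How the SageMaker serving container chains these hooks at inference time
# (a conceptual sketch for local debugging only; `payload` is a hypothetical
# CSV string, and /opt/ml/model is the container's default model directory):
#
#   model = model_fn("/opt/ml/model")
#   df = input_fn(payload, "text/csv")
#   predictions = predict_fn(df, model)
#   response = output_fn(predictions, "text/csv")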