import argparse import os import logging from sklearn.externals import joblib import numpy as np import sys import pandas as pd from sklearn import tree # Adapted from: # https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/scikit_learn_iris/scikit_learn_estimator_example_with_batch_transform_outputs.ipynb logger = logging.getLogger() logger.setLevel(logging.DEBUG) logger.addHandler(logging.StreamHandler(sys.stdout)) def model_fn(model_dir): clf = joblib.load(os.path.join(model_dir, "model.joblib")) return clf if __name__ == '__main__': parser = argparse.ArgumentParser() # Hyperparameters are described here. In this simple example we are just including one hyperparameter. parser.add_argument('--max_leaf_nodes', type=int, default=-1) # Sagemaker specific arguments. Defaults are set in the environment variables. parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) args = parser.parse_args() # Take the set of files and read them all into a single pandas dataframe input_files = [ os.path.join(args.train, file) for file in os.listdir(args.train) ] if len(input_files) == 0: raise ValueError(('There are no files in {}.\n' + 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 'the data specification in S3 was incorrectly specified or the role specified\n' + 'does not have permission to access the data.').format(args.train, "train")) raw_data = [ pd.read_csv(file, engine="python") for file in input_files ] train_data = pd.concat(raw_data) # labels are in the first column train_y = train_data.iloc[:, 0] train_X = train_data.iloc[:, 1:] # Here we support a single hyperparameter, 'max_leaf_nodes'. Note that you can add as many # as your training my require in the ArgumentParser above. max_leaf_nodes = args.max_leaf_nodes # Now use scikit-learn's decision tree classifier to train the model. clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes) clf = clf.fit(train_X, train_y) # Print the coefficients of the trained classifier, and save the coefficients joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))