#!/usr/bin/env python # A sample training component that trains a simple scikit-learn decision tree model. # This implementation works in File mode and makes no assumptions about the input file names. # Input is specified as CSV with a data point in each row and the labels in the first column. from __future__ import print_function import json import os import pickle import sys import traceback import logging import pandas as pd from sklearn import tree from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score logger = logging.getLogger() logger.setLevel(logging.INFO) logger.addHandler(logging.StreamHandler()) # These are the paths to where SageMaker mounts interesting things in your container. prefix = '/opt/ml/' input_path = os.path.join(prefix, 'input/data') output_path = os.path.join(prefix, 'output') model_path = os.path.join(prefix, 'model') param_path = os.path.join(prefix, 'input/config/hyperparameters.json') # This algorithm has a single channel of input data called 'training'. Since we run in # File mode, the input files are copied to the directory specified here. channel_name_training='training' training_path = os.path.join(input_path, channel_name_training) channel_name_validation='validation' validation_path = os.path.join(input_path, channel_name_validation) # The function to execute the training. 
def read_channel_data(channel_path, channel_name):
    """Read every CSV file in a channel directory into one DataFrame.

    Args:
        channel_path: directory the channel's files were copied into.
        channel_name: channel name, used only to build the error message.

    Returns:
        A pandas DataFrame concatenating all files (no header row assumed).

    Raises:
        ValueError: if the directory contains no files at all.
    """
    input_files = [os.path.join(channel_path, f) for f in os.listdir(channel_path)]
    if not input_files:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(channel_path, channel_name))
    return pd.concat([pd.read_csv(f, header=None) for f in input_files])


def _record_failure(exc, stage):
    """Write a 'failure' file for SageMaker, echo the traceback, and exit 255.

    The 'failure' file content is returned as the failureReason in the
    DescribeTrainingJob result; printing to stderr puts the same text in the
    training job logs.  A non-zero exit code marks the job as Failed.
    """
    trc = traceback.format_exc()
    # HACK: fix up ownership of the output dir inside the container; the path
    # is a fixed constant, so building the shell command by concatenation is
    # not an injection risk here.
    os.system('sudo chown -R 1000:100 ' + output_path)
    msg = 'Exception during ' + stage + ': ' + str(exc) + '\n' + trc
    with open(os.path.join(output_path, 'failure'), 'w') as s:
        s.write(msg)
    print(msg, file=sys.stderr)
    sys.exit(255)


def train():
    """Train a decision tree on the 'training' channel and save the model.

    Reads hyperparameters from param_path (always strings, converted here),
    loads all CSVs from the training channel (labels in the first column),
    fits a DecisionTreeClassifier and pickles it into model_path.  On any
    error a 'failure' file is written and the process exits non-zero.

    Returns:
        The fitted DecisionTreeClassifier.
    """
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job.
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)

        train_data = read_channel_data(training_path, channel_name_training)

        # Labels are in the first column; everything else is a feature.
        train_y = train_data.iloc[:, 0]
        train_X = train_data.iloc[:, 1:]

        # Hyperparameters are always passed in as strings, so we need to do
        # any necessary conversions before handing them to the estimator.
        max_leaf_nodes = trainingParams.get('max_leaf_nodes', None)
        if max_leaf_nodes is not None:
            max_leaf_nodes = int(max_leaf_nodes)
        max_depth = trainingParams.get('max_depth', None)
        if max_depth is not None:
            max_depth = int(max_depth)

        clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes,
                                          max_depth=max_depth)
        clf.fit(train_X, train_y)  # fit() trains in place; no rebind needed

        # Save the model (chown works around container file ownership; the
        # path is a fixed constant, so no injection risk).
        os.system('sudo chown -R 1000:100 ' + model_path)
        with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'wb') as out:
            pickle.dump(clf, out)
        print('Training complete.')
        return clf
    except Exception as e:
        _record_failure(e, 'training')


def validation(clf):
    """Score clf on the 'validation' channel and log the ROC AUC.

    Args:
        clf: a fitted classifier exposing predict().
    """
    print('Starting the validation.')
    try:
        validation_data = read_channel_data(validation_path, channel_name_validation)

        # Labels are in the first column; everything else is a feature.
        val_y = validation_data.iloc[:, 0]
        val_X = validation_data.iloc[:, 1:]

        predictions = clf.predict(val_X)
        # NOTE(review): hard class predictions are fed to roc_auc_score; AUC is
        # normally computed from predict_proba scores — confirm this is intended.
        auc = roc_auc_score(val_y, predictions)
        logger.info('auc:%s', auc)  # lazy %-args; same rendered text as before
    except Exception as e:
        # BUGFIX: previously reported 'Exception during training' even for
        # validation failures, giving a misleading failureReason.
        _record_failure(e, 'validation')


if __name__ == '__main__':
    clf = train()
    validation(clf)
    # A zero exit code causes the job to be marked a Succeeded.
    sys.exit(0)