#!/usr/bin/env python

# A sample training component that trains a simple HDBSCAN model.
# This implementation works in File mode and makes no assumptions about the input file names.

import json
import os
import pickle
import sys
import traceback

import hdbscan
import pandas as pd

# Root of the container file layout; every other path is derived from it.
prefix = '/opt/ml/'

input_path = prefix + 'input/data'
output_path = os.path.join(prefix, 'output')
train_channel_name = 'train'
validation_channel_name = 'validation'

model_path = os.path.join(prefix, 'model')
model_file_name = 'hdbscan-model.pkl'
train_path = os.path.join(input_path, train_channel_name)

param_path = os.path.join(prefix, 'input/config/hyperparameters.json')


# The function to execute the training.
def train():
    """Fit an HDBSCAN clusterer on the train channel and pickle it.

    Reads the hyperparameters JSON at ``param_path``, loads every file found
    in ``train_path`` as a header-less CSV, concatenates them into a single
    dataframe, fits ``hdbscan.HDBSCAN`` on it and writes the fitted object to
    ``model_path/model_file_name``.

    The only hyperparameter consulted is ``min_cluster_size``; when present
    it is converted with ``int()``.

    Raises:
        SystemExit: with code 255 if any exception occurs. The exception
            message and traceback are written to ``output_path/failure``
            and printed before exiting.
    """
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        print('Reading hyperparameters data: {}'.format(param_path))
        with open(param_path) as json_file:
            hyperparameters_data = json.load(json_file)
        print('hyperparameters_data: {}'.format(hyperparameters_data))

        # Take the set of train files and read them all into a single pandas dataframe.
        # The listing is sorted so that the row order of the concatenated frame
        # (and therefore the order of the fitted labels) is reproducible.
        train_input_files = [os.path.join(train_path, file)
                             for file in sorted(os.listdir(train_path))]
        if len(train_input_files) == 0:
            raise ValueError(('There are no files in {}.\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(train_path, train_channel_name))
        print('Found train files: {}'.format(train_input_files))
        raw_data = [pd.read_csv(file, header=None) for file in train_input_files]
        train_df = pd.concat(raw_data)

        # Only forward min_cluster_size when the user supplied it. Passing
        # None explicitly would override the library default with a value
        # that is not a valid cluster size.
        hdbscan_kwargs = {'cluster_selection_method': 'eom'}
        min_cluster_size = hyperparameters_data.get('min_cluster_size', None)
        if min_cluster_size is not None:
            hdbscan_kwargs['min_cluster_size'] = int(min_cluster_size)

        clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)

        print("Start HDBSCAN clustering...")
        clusterer = clusterer.fit(train_df)

        # Label -1 marks noise points, so it is not counted as a cluster.
        labels = clusterer.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print("Clustering finished, found", n_clusters_, 'clusters')
        print(n_noise_, "samples marked as noise (not in any cluster)")

        # save the model
        os.makedirs(model_path, exist_ok=True)
        with open(os.path.join(model_path, model_file_name), 'wb') as out:
            pickle.dump(clusterer, out)
        print("model {} saved.".format(model_file_name))

        print('Training complete.')
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        # Make sure the directory exists so that reporting the failure cannot
        # itself fail on a missing path and hide the original error.
        os.makedirs(output_path, exist_ok=True)
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)


if __name__ == '__main__':
    train()

    # A zero exit code causes the job to be marked as Succeeded.
    sys.exit(0)