#!/usr/bin/env python
# Amazon Machine Learning Samples
# Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Amazon Software License (the "License"). You may not use
# this file except in compliance with the License. A copy of the License is
# located at
#
#     http://aws.amazon.com/asl/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or
# implied. See the License for the specific language governing permissions and
# limitations under the License.
import base64
import boto
import json
import os
import logging
import config

logger = logging.getLogger(config.APP_NAME)


class Fold(object):
    """
    This class represents a 'Fold' in K-fold Cross-validation. A fold
    understands the ordinal number in its sequence of Fold objects, so
    that it is able to draw the splitting range, and the splitting
    complement flag for DataRearrangement string. The instance of a fold
    can use Amazon ML's API to create an evaluation datasource, a
    training datasource, an ML model and an evaluation.

    Typical use is to construct the fold and call build(), which runs
    create_datasources(), create_ml_model() and create_eval() in that
    order. The create_* methods depend on attributes set by the earlier
    steps (the connection, the datasource IDs and names, the model ID).
    """

    def __init__(self, data_spec=None, this_fold=None, kfolds=None):
        """
        Construct this instance of Fold.

        Args:
            data_spec: the named tuple object that wraps dataset related
                parameters.
            this_fold: the integer number indicating the ordinal number
                of this instance in its sequence of Fold objects,
                counting from 0.
            kfolds: the integer number representing the number of folds.
        """
        self.data_spec = data_spec
        self.this_fold = this_fold
        self.kfolds = kfolds
        # fold_ordinal counts from 1. The signature allows this_fold to be
        # omitted, so do not attempt arithmetic on None.
        if this_fold is None:
            self.fold_ordinal = None
        else:
            self.fold_ordinal = this_fold + 1

        # Connection to Amazon ML; established by build().
        self._ml = None

        # Identifiers, names and rearrangement strings of the entities
        # created on Amazon ML; populated by the create_* methods.
        self.train_ds_id = None
        self.train_ds_name = None
        self.train_ds_rearrange = None
        self.eval_ds_id = None
        self.eval_ds_name = None
        self.eval_ds_rearrange = None
        self.ml_id = None
        self.ml_name = None
        self.ev_id = None
        self.ev_name = None

    def build(self):
        """
        Builds the necessary entities on Amazon ML: connects to the
        service, then creates the datasources, the ML model and the
        evaluation, in that order.
        """
        self._ml = boto.connect_machinelearning()
        self.create_datasources()
        self.create_ml_model()
        self.create_eval()

    def __str__(self):
        """
        Returns the string representing this fold object. The string
        includes the IDs of entities newly created on Amazon ML, as
        well as the DataRearrangement string for each newly created
        datasource.
        """
        return """\n\
Fold {fold_ordinal} of {kfolds}:
 -Training Datasource ID: {train_ds_id}
 -Training Datasource Rearrangement: {train_ds_rearrange}
 -Evaluation Datasource ID: {eval_ds_id}
 -Evaluation Datasource Rearrangement: {eval_ds_rearrange}
 -ML Model ID: {ml_id}
 -Evaluation ID: {ev_id}""".format(**self.__dict__)

    @staticmethod
    def _new_entity_id(prefix):
        """
        Returns a new random entity ID: the given prefix followed by the
        base32 encoding of 10 random bytes.
        """
        return prefix + base64.b32encode(os.urandom(10)).decode("ascii")

    def _submit_datasource(self, ds_id, ds_name, rearrangement_str):
        """
        Issues the create-datasource-from-S3 request for one datasource
        of this fold, using the S3 location and schema from data_spec.
        """
        self._ml.create_data_source_from_s3(
            data_source_id=ds_id,
            data_source_name=ds_name,
            data_spec={
                "DataLocationS3": self.data_spec.data_s3_url,
                "DataSchema": self.data_spec.schema,
                "DataRearrangement": rearrangement_str
            },
            compute_statistics=True
        )

    def create_datasources(self):
        """
        Creates datasource for model training and evaluation on Amazon ML.

        The training datasource takes the complement of this fold's
        splitting range; the evaluation datasource takes the range itself.
        """
        # create training datasource for this fold
        self.train_ds_id = self._new_entity_id("ds-")
        self.train_ds_rearrange = self.build_rearrangement_str(
            is_complement=True)
        self.train_ds_name = self.build_datasource_name(
            self.data_spec.name, self.train_ds_rearrange)
        self._submit_datasource(
            self.train_ds_id, self.train_ds_name, self.train_ds_rearrange)
        logger.info("Created Training Datasource %s", self.train_ds_id)

        # create evaluation datasource for this fold
        self.eval_ds_id = self._new_entity_id("ds-")
        self.eval_ds_rearrange = self.build_rearrangement_str(
            is_complement=False)
        self.eval_ds_name = self.build_datasource_name(
            self.data_spec.name, self.eval_ds_rearrange)
        self._submit_datasource(
            self.eval_ds_id, self.eval_ds_name, self.eval_ds_rearrange)
        logger.info("Created Evaluation Datasource %s", self.eval_ds_id)

    def build_rearrangement_str(self, is_complement):
        """
        Returns the DataRearrangement string.

        Args:
            is_complement: the boolean flag to indicate whether the
                datasource takes the given splitting range, or the
                complement of it.
        Returns:
            a string of the DataRearrangement
        """
        # Use integer division as rearrange API only support percentage
        # in integer. Casting self.kfolds to integer for Python 2
        # compatibility.
        kfolds = int(self.kfolds)
        # Multiply before dividing so that consecutive folds tile the whole
        # 0-100 range even when kfolds does not divide 100 (e.g. 3 folds
        # give 0-33, 33-66, 66-100 rather than stopping at 99).
        percent_begin = self.this_fold * 100 // kfolds
        percent_end = (self.this_fold + 1) * 100 // kfolds
        return json.dumps({
            "splitting": {
                "percentBegin": percent_begin,
                "percentEnd": percent_end,
                "complement": is_complement,
                "strategy": config.STRATEGY,
                "strategyParams": {
                    "randomSeed": config.RANDOM_STRATEGY_RANDOM_SEED
                }
            }
        })

    def build_datasource_name(self, name, rearrangement_str):
        """
        Builds the name of datasource to create

        Args:
            name: the user-provided name of entities on Amazon ML
            rearrangement_str: the rearrangement JSON string
        Returns:
            a string representing the name of datasource to create
        """
        splitting = json.loads(rearrangement_str)["splitting"]
        return "{name} [percentBegin={pb}, percentEnd={pe}, complement={c}]"\
            .format(name=name,
                    pb=splitting["percentBegin"],
                    pe=splitting["percentEnd"],
                    c=splitting["complement"])

    def create_ml_model(self):
        """
        Creates ML Model on Amazon ML using the training datasource.

        Relies on train_ds_id and train_ds_name, which are set by
        create_datasources().
        """
        self.ml_id = self._new_entity_id("ml-")
        self.ml_name = "ML model: " + self.train_ds_name
        self._ml.create_ml_model(
            ml_model_id=self.ml_id,
            ml_model_name=self.ml_name,
            training_data_source_id=self.train_ds_id,
            ml_model_type=self.data_spec.ml_model_type,
            parameters={
                "sgd.maxPasses": self.data_spec.sgd_maxPasses,
                "sgd.maxMLModelSizeInBytes":
                    self.data_spec.sgd_maxMLModelSizeInBytes,
                "sgd.l2RegularizationAmount":
                    self.data_spec.sgd_l2RegularizationAmount
            },
            recipe=self.data_spec.recipe,
        )
        logger.info("Created ML Model %s", self.ml_id)

    def create_eval(self):
        """
        Creates Evaluation on Amazon ML using the evaluation datasource.

        Relies on ml_id and ml_name set by create_ml_model(), and on
        eval_ds_id set by create_datasources().
        """
        self.ev_id = self._new_entity_id("ev-")
        self.ev_name = "Evaluation: " + self.ml_name
        self._ml.create_evaluation(
            evaluation_id=self.ev_id,
            evaluation_name=self.ev_name,
            ml_model_id=self.ml_id,
            evaluation_data_source_id=self.eval_ds_id
        )
        logger.info("Created Evaluation %s", self.ev_id)