# Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Amazon Software License (the "License"). # You may not use this file except in compliance with the License. # A copy of the License is located at # # http://aws.amazon.com/asl/ # # or in the "license" file accompanying this file. This file is distributed # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express # or implied. See the License for the specific language governing permissions # and limitations under the License. """AWSPyML - Python utilities to help with Amazon Machine Learning. """ import boto import csv import json import math import random import re def aml_connection(): """Connects to the service and validates that credentials are configured properly. """ ml = boto.connect_machinelearning() try: # Check that the connection is configured properly ml.describe_ml_models(limit=1) except: raise RuntimeError("""There was a problem connecting to Amazon Machine Learning. Be sure your AWS credentials are properly configured. A credentials file should be in ~/.aws/credentials (or C:\Users\USER_NAME\.aws\credentials on Windows) and look like: [Credentials] aws_access_key_id = aws_secret_access_key = """) return ml class Identifiers(object): chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789' @classmethod def _new(cls, prefix=None): if prefix: out = prefix + '-' else: out = '' for i in range(11): # 62^11 -> 65 random bits, which is sufficient to avoid collisions # creating 1 entity per second for 200+ years. out += cls.chars[random.randint(0, len(cls.chars) - 1)] return out @classmethod def new_data_source_id(cls): return cls._new('ds') @classmethod def new_ml_model_id(cls): return cls._new('ml') @classmethod def new_evaluation_id(cls): return cls._new('ev') @classmethod def new_batch_prediction_id(cls): return cls._new('bp') class AWSPyMLException(Exception): pass class SchemaException(AWSPyMLException): pass class InvalidSchemaException(SchemaException): pass class JsonConfiguration(object): """Base class for JSON Configuration objects. """ def as_obj(self): self.validate() return self._obj def _json_kwargs(self, dense): if dense: return {} else: return {"indent": 2} def validate(self): """Raises an exception if any of the contents do not match requirements. Should be overridden by sub-class. """ pass # base class does nothing def as_json_string(self, dense=False): self.validate() return json.dumps(self._obj, **(self._json_kwargs(dense))) def write_to_file(self, filename, dense=False): self.validate() f = open(filename) json.dump(self._obj, f, **self._json_kwargs(dense)) f.close() def save_to_s3(self, s3url, dense=False): self.validate() raise "Not implemented yet" class Schema(JsonConfiguration): """A Schema object for Amazon Machine Learning. Keeps track of variable types. Helps format the JSON used by the API. """ VALID_VARIABLE_TYPES = [ "BINARY", "CATEGORICAL", "NUMERIC", "TEXT", ] def __init__(self): self._obj = { "version": "1.0", "dataFormat": "CSV", "attributes": [], "dataFileContainsHeader": True, "targetAttributeName": None, "excludedAttributeNames": [], # Optional "rowId": None, # Optional } def validate(self): """Validates that the schema object is properly formed. Either returns True or raises an exception. Note this is not 100% robust and might not catch all problems that the API will catch. """ if self.num_attributes() == 0: raise InvalidSchemaException("no attributes defined") for idx, var in enumerate(self.attributes()): name = var["attributeName"] typ = var["attributeType"] if not name or not typ: raise InvalidSchemaException( "Variable #%d is not fully defined" % idx) if len(name) > 64: raise InvalidSchemaException("%s" % var["attributeName"]) if typ not in self.VALID_VARIABLE_TYPES: raise InvalidSchemaException( "variable %s has invalid type %s" % (name, typ)) return True # Passes all rules. def set_target(self, target_variable_name): """Sets the target variable to the variable of the specified name """ if not self.get_variable_by_name(target_variable_name): raise SchemaException("Can't set target to undefined variable %s" % target_variable_name) self._obj['targetAttributeName'] = target_variable_name def set_variable_name(self, idx, name): """For variable in the position idx, set its name """ self.get_variable_by_idx(idx)['attributeName'] = name def set_variable_type(self, idx, attributeType): """For variable in the position idx, set its field type. """ self.get_variable_by_idx(idx)['attributeType'] = attributeType def num_attributes(self): return len(self._obj['attributes']) def attributes(self): return self._obj["attributes"] def get_variable_by_name(self, name): for var in self.attributes(): if var["attributeName"] == name: return var return None def get_variable_by_idx(self, idx): if self.num_attributes() <= idx: # The list is too small, so we expand it amount = 1 + idx - self.num_attributes() self._obj['attributes'].extend( [{ "attributeName": None, "attributeType": None, }] * amount ) return self._obj['attributes'][idx] def set_header_line(self, header_line): """Takes a boolean to specify whether or not the data file(s) contain a header line with the names of the attributes in it """ self._obj["dataFileContainsHeader"] = header_line class SchemaGuesser(object): """Guesses at a good Schema object by looking at sample data. """ def __init__(self): self.data = [] self.num_attributes = 0 self.schema = Schema() self.text_words_threshold = 20 # Heuristic. def from_file(self, filename, target_variable=None, header_line='auto', num_lines_to_use=1000): """Returns a Schema object that is guessed by looking at the first num_lines_to_use records from the given file. Can set header_line to true or false if known, otherwise, it guesses. """ f = open(filename) try: self._load_csv_data(f, num_lines_to_use) schema = self._guess_schema_from_data(header_line) if target_variable: schema.set_target(target_variable) return schema finally: f.close() def _guess_schema_from_data(self, header_line): if header_line == 'auto': header_line = self._guess_if_header_line_present() self._name_attributes(header_line) for i in xrange(self.num_attributes): # Loop through columns column = [row[i] for row in self.data] if header_line: column = column[1:] self.schema.set_variable_type(i, self._guess_variable_type(column)) return self.schema def _guess_variable_type(self, samples): counts = { "NUMERIC": 0, "BINARY": 0, "TEXT": 0, "CATEGORICAL": 0, } for sample in samples: try: num = float(sample) counts["NUMERIC"] += 1 if num == 0 or num == 1: counts["BINARY"] += 1 except: # Non-numeric word_count = len(re.split("\s+", sample)) if word_count > self.text_words_threshold: counts["TEXT"] += 1 else: counts["CATEGORICAL"] += 1 return max(counts, key=counts.get) def _name_attributes(self, header_line): digits = int(math.floor(math.log10(self.num_attributes))) + 1 for idx, name in enumerate(self.data[0]): if header_line: self.schema.set_variable_name(idx, name) else: name = "Var{number:0{width}d}".format(width=digits, number = idx + 1) self.schema.set_variable_name(idx, name) def _guess_if_header_line_present(self): header_row = self.data[0] has_header_line = self._guess_variable_type( header_row) == "CATEGORICAL" self.schema.set_header_line(has_header_line) return has_header_line def _load_csv_data(self, csvfile, num_lines_to_use): """Loads at most num_lines_to_use records from the open file with csv data. Returns as list of lists. """ csvreader = csv.reader(csvfile) for record in csvreader: self.data.append(record) if len(self.data) == num_lines_to_use: break self.num_attributes = len(self.data[0])