# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from datetime import datetime
from enum import Enum

from dateutil import parser
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted


class DateTimeProperty:
    def __init__(self, extract_func, max_, min_):
        """Contains information about a property of a datetime object

        Note the argument order: the maximum comes before the minimum.

        Parameters
        ----------
        extract_func: function
            function mapping a datetime object to the property

        max_: int
            maximum value for the property

        min_: int
            minimum value for the property
        """
        self.min = min_
        self.max = max_
        self.extract_func = extract_func


# Every extractor below returns np.nan for anything that is not a datetime.datetime instance, so values that
# could not be parsed (e.g. None) turn into missing values instead of raising.


def extract_week_of_year(t):
    """Return the ISO week number of ``t`` (second item of ``isocalendar()``), or NaN if not a datetime."""
    return t.isocalendar()[1] if isinstance(t, datetime) else np.nan


def extract_weekday(t):
    """Return the ISO weekday of ``t`` (third item of ``isocalendar()``), or NaN if not a datetime."""
    return t.isocalendar()[2] if isinstance(t, datetime) else np.nan


def extract_year(t):
    """Return the year of ``t``, or NaN if not a datetime."""
    return t.year if isinstance(t, datetime) else np.nan


def extract_hour(t):
    """Return the hour of ``t``, or NaN if not a datetime."""
    return t.hour if isinstance(t, datetime) else np.nan


def extract_month(t):
    """Return the month of ``t``, or NaN if not a datetime."""
    return t.month if isinstance(t, datetime) else np.nan


def extract_minute(t):
    """Return the minute of ``t``, or NaN if not a datetime."""
    return t.minute if isinstance(t, datetime) else np.nan


def extract_quarter(t):
    """Return the quarter of ``t`` (months 1-3 -> 1, 4-6 -> 2, ...), or NaN if not a datetime."""
    return (t.month - 1) // 3 + 1 if isinstance(t, datetime) else np.nan


def extract_second(t):
    """Return the second of ``t``, or NaN if not a datetime."""
    return t.second if isinstance(t, datetime) else np.nan


def extract_day_of_year(t):
    """Return the day of the year of ``t`` (``tm_yday``), or NaN if not a datetime."""
    return t.timetuple().tm_yday if isinstance(t, datetime) else np.nan


def extract_day_of_month(t):
    """Return the day of the month of ``t``, or NaN if not a datetime."""
    return t.day if isinstance(t, datetime) else np.nan


class DateTimeDefinition(Enum):
    """Datetime properties that can be extracted by DateTimeVectorizer.

    The value of each member is a DateTimeProperty built from the extraction function, the maximum value and
    the minimum value of the property (in that order). YEAR has no bounds (both are None).
    """

    WEEK_OF_YEAR = DateTimeProperty(extract_week_of_year, 53, 1)
    WEEKDAY = DateTimeProperty(extract_weekday, 7, 1)
    YEAR = DateTimeProperty(extract_year, None, None)
    HOUR = DateTimeProperty(extract_hour, 23, 0)
    MONTH = DateTimeProperty(extract_month, 12, 1)
    MINUTE = DateTimeProperty(extract_minute, 59, 0)
    QUARTER = DateTimeProperty(extract_quarter, 4, 1)
    SECOND = DateTimeProperty(extract_second, 59, 0)
    DAY_OF_YEAR = DateTimeProperty(extract_day_of_year, 366, 1)
    DAY_OF_MONTH = DateTimeProperty(extract_day_of_month, 31, 1)


class DateTimeVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, extract=None, mode="cyclic", ignore_constant_columns=True, default_datetime=None):
        """Converts array-like data with datetime.datetime or strings describing datetime objects into numeric features

        A datetime item contains categorical information: year, month, hour, day of week, etc. This information is
        given as the output features. The encoding of these categories can be ordinal or cyclic. The cyclic encoding
        of an integer i between low and high consists of two floats:
        sin(2 * pi * (i - low) / (high - low + 1)), cos(2 * pi * (i - low) / (high - low + 1)).
        This makes sure for example that the months December and January are encoded to vectors that are close in
        Euclidean distance.

        Parameters
        ----------
        extract: list of DateTimeProperty, default None
            Types of data to extract. See DateTimeDefinition class for options. If given None, defaults to
            DateTimeVectorizer.default_data

        mode: str, default cyclic
            'ordinal': each data type is outputted to a non-negative integer, as in ordinal encoding for
                       categorical data
            'cyclic': each data type is converted to two numbers in [-1,1] so that the distance between these
                      numbers is small for close items in the cyclic order (for example hour=23 is close to hour=0)
            In both modes a property without bounds (the year) is emitted unchanged as a single column.

        ignore_constant_columns: bool, default True
            If True, fit will make sure the output columns are not constant in the training set.

        default_datetime: DateTime, default None
            Default DateTime object to use when information is missing from input array. This DateTime object is
            passed as a keyword argument into the dateutil.parser.parse method. If this is a datetime object and not
            None, elements specified in the parse method replace elements in the default object. When
            ignore_constant_columns is True, the filled DateTime information will be removed if constant.

        Attributes
        ----------
        extract_ : list of DateTimeProperty
            List of DateTimeProperty objects, each providing the necessary information for extracting a single
            property from a datetime object. The properties corresponding to this list describe the different
            columns of the output of the transform function

        Examples
        --------
        >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer
        >>> import numpy as np
        >>> data = np.array([
        ...     'Jan 3th, 2018, 1:34am',
        ...     'Feb 11th, 2012, 11:34:59pm',
        ... ]).reshape((-1, 1))
        >>> date_time = DateTimeVectorizer(mode='ordinal', ignore_constant_columns=False)
        >>> X = date_time.fit_transform(data)
        >>> print(X.shape)
        (2, 7)
        >>> print(X[0].astype(int))
        [   2 2018    1   34    0    0    0]
        >>> date_time = DateTimeVectorizer(mode='ordinal')
        >>> # with ignore_constant_columns=True, the minute field, which is 34 in both examples, will be filtered
        >>> X = date_time.fit_transform(data)
        >>> print(X.shape)
        (2, 6)
        >>> print(X[0].astype(int))
        [   2 2018    1    0    0    0]
        """
        self.extract = extract
        self.mode = mode
        self.ignore_constant_columns = ignore_constant_columns
        self.default_datetime = default_datetime

    @staticmethod
    def _cyclic_transform(data, low, high):
        """
        Converts numeric data into 2d-cyclic.

        The conversion of a single integer into two floats makes sure that the Euclidean distance between two
        (output) values is similar to the cyclic distance between the integers. For example, hour of day is a number
        between 0 and 23. The cyclic distance between the hours 0 and 23 is 1 (and not 23). After the cyclic
        transform, the transformed hour 0 will be a vector very close to that of the hour 23, and far away from that
        of 12.

        Parameters
        ----------
        data: np.array of numbers
        low: lower bound of the data values
        high: upper bound of the data values

        Returns
        -------
        np.array with double the dimension in the last axis. Each input value is replaced by its (sin, cos) pair,
        so the pairs of neighbouring input values are adjacent in the output.

        Examples
        --------
        >>> import numpy as np
        >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer
        >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4]]), low=1, high=4)
        >>> # up to numeric precision, the outputs should be [[0,1], [1,0], [0,-1], [-1,0]]
        >>> print(output)
        [[ 0.0000000e+00  1.0000000e+00]
         [ 1.0000000e+00  6.1232340e-17]
         [ 1.2246468e-16 -1.0000000e+00]
         [-1.0000000e+00 -1.8369702e-16]]
        >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4],[5],[6],[7],[8]]), low=1, high=8)
        >>> print(output)
        [[ 0.00000000e+00  1.00000000e+00]
         [ 7.07106781e-01  7.07106781e-01]
         [ 1.00000000e+00  6.12323400e-17]
         [ 7.07106781e-01 -7.07106781e-01]
         [ 1.22464680e-16 -1.00000000e+00]
         [-7.07106781e-01 -7.07106781e-01]
         [-1.00000000e+00 -1.83697020e-16]
         [-7.07106781e-01  7.07106781e-01]]
        """
        # map [low, high] onto [0, 2*pi); the "+ 1" keeps `high` one step short of wrapping back onto `low`
        angles = (data - low) * 2 * np.pi / (1 + high - low)
        # stacking on a new trailing axis pairs every value with its own (sin, cos) ...
        pairs = np.stack((np.sin(angles), np.cos(angles)), axis=-1)
        # ... and folding that axis into the previous one doubles the last dimension
        return pairs.reshape(angles.shape[:-1] + (2 * angles.shape[-1],))

    default_data = [
        DateTimeDefinition.WEEKDAY.value,
        DateTimeDefinition.YEAR.value,
        DateTimeDefinition.HOUR.value,
        DateTimeDefinition.MINUTE.value,
        DateTimeDefinition.SECOND.value,
        DateTimeDefinition.MONTH.value,
        DateTimeDefinition.WEEK_OF_YEAR.value,
    ]

    def _to_datetime_single(self, item):
        """Converts a single item into a datetime object, or None when it cannot be parsed

        Parameters
        ----------
        item : datetime.datetime, str or any other object
            datetime objects are returned untouched; everything else is handed to dateutil.parser.parse together
            with ``self.default_datetime``

        Returns
        -------
        datetime.datetime or None
        """
        if isinstance(item, datetime):
            return item
        try:
            return parser.parse(item, default=self.default_datetime)
        except (ValueError, TypeError, OverflowError):
            # ValueError: unparsable string, TypeError: non-string input (e.g. NaN), OverflowError: numeric field
            # too large for the platform. All of them mean "missing value" here rather than a fatal error.
            return None

    def _to_datetime_array(self, X):
        """Converts np array with string or datetime into datetime or None

        Parameters
        ----------
        X : np.array
            numpy array containing data representing datetime objects

        Returns
        -------
        X : np.array
            np.array with datetime objects of the same shape of the input. Items that could not be parsed become None
        """
        # declaring the output type up front spares np.vectorize its extra probing call on the first element
        return np.vectorize(self._to_datetime_single, otypes=[object])(X)

    def fit(self, X, y=None):
        """Filter the extracted field so as not to contain constant columns.

        Parameters
        ----------
        X : {array-like}, datetime.datetime or str

        y : ignored

        Returns
        -------
        self : DateTimeVectorizer

        Raises
        ------
        ValueError
            If ``mode`` is neither 'cyclic' nor 'ordinal'.

        Notes
        -----
        If fitting with a 2d array with more than one column, any data type that is not constant in any column will
        remain. If for example, column 1 has year=1999 for all rows but column 2 has two or more possible year
        values, we will still produce an output with the year information from column 1. To avoid this, run fit on
        each column separately, and obtain a separate DateTimeVectorizer for each column
        """
        # validate the configuration before spending time on parsing the data
        if self.mode not in ["cyclic", "ordinal"]:
            raise ValueError("mode must be either cyclic or ordinal. Current value is {}".format(self.mode))

        # NOTE(review): newer scikit-learn releases appear to rename `force_all_finite` to `ensure_all_finite` -
        # confirm against the scikit-learn version this package pins.
        X = check_array(X, dtype=None, force_all_finite="allow-nan")
        X = self._to_datetime_array(X)

        # copy, so that the fitted attribute never aliases the shared class-level default list
        self.extract_ = list(self.extract or self.default_data)

        if self.ignore_constant_columns:
            candidates = self.extract_
            kept = []
            for col in range(X.shape[1]):
                # convert the current column to get the different property values. Ordinal mode yields exactly
                # one output column per property, so variances line up with `candidates`
                transformed = self._convert(X[:, col].reshape((-1, 1)), mode="ordinal")
                # check for constant columns; NaN entries (unparsable items) are ignored
                variances = np.nanvar(transformed, axis=0)
                for datetime_property, variance in zip(candidates, variances):
                    if variance > 0 and datetime_property not in kept:
                        kept.append(datetime_property)
            if not kept:
                # everything is constant - keep the first property so the output is never empty
                kept = [candidates[0]]
            self.extract_ = kept

        return self

    def _convert(self, X, mode):
        """Extracts the properties in ``self.extract_`` from a 2d array of datetime objects

        Parameters
        ----------
        X : np.array
            2d array of datetime objects; any other item (e.g. None) results in NaN values
        mode : str
            'ordinal' or 'cyclic', see the class documentation

        Returns
        -------
        np.array of float32 with one row per input row. For every input column it holds the encoded properties
        in the order of ``self.extract_``.
        """
        n_cols = X.shape[1]
        cols = []

        for datetime_property in self.extract_:
            # apply the function on the datetime values in the input array, create a python list. To iterate over all
            # items we view the input as a 1d vector
            cur_conversions = [datetime_property.extract_func(item) for item in X.reshape((-1,))]
            # convert the list to a float32 numpy array
            cur_extract = np.array(cur_conversions, dtype=np.float32).reshape((-1, 1))
            if datetime_property.min is None:
                # the output isn't cyclic. Leave it as is
                pass
            elif mode == "ordinal":
                # the output is ordinal - shift it so the minimum value is 0
                cur_extract -= datetime_property.min
            elif mode == "cyclic":
                # the output is cyclic - need to apply the cyclic transform
                cur_extract = self._cyclic_transform(
                    cur_extract, low=datetime_property.min, high=datetime_property.max
                )

            cols.append(cur_extract)

        ret = np.concatenate(cols, axis=1)
        # the return array is in 1d form (one row per input item). We need to reshape it to bring it back to the
        # correct 2d form
        ret = ret.reshape((-1, n_cols * ret.shape[1]))
        return ret

    def transform(self, X, y=None):
        """Converts datetime-like input into the numeric features selected during fit

        Parameters
        ----------
        X : {array-like}, datetime.datetime or str

        y : ignored

        Returns
        -------
        np.array of float32, encoded according to ``self.mode``
        """
        X = check_array(X, dtype=None, force_all_finite="allow-nan")
        check_is_fitted(self, "extract_")

        X = self._to_datetime_array(X)

        return self._convert(X, self.mode)

    def _more_tags(self):
        return {"X_types": ["datetime.datetime", "string"]}