# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import VectorizerMixin, TfidfVectorizer
from sklearn.utils.validation import check_array, check_is_fitted


class MultiColumnTfidfVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
    """Applies ``sklearn.feature_extraction.text.TfidfVectorizer`` to each column in an array.

    Each column of text is treated separately with its own TfidfVectorizer. The vectorizers are applied
    sequentially.

    Parameters
    ----------
    strip_accents : {'ascii', 'unicode', None} (default=None)
        Remove accents and perform other character normalization during the
        preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean (default=True)
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default=None)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default=None)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : string {'english'}, list, or None (default=None)
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n) (default=(1, 1))
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that
        min_n <= n <= max_n will be used.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.
    max_df : float in range [0.0, 1.0] or int (default=1.0)
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int (default=1)
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None (default=1000)
        If not None, build a vocabulary that only considers the top
        max_features ordered by term frequency across the corpus.
        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional (default=None)
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input.

    dtype : type, optional (default=float64)
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
          similarity between two vectors is their dot product when l2 norm
          has been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
          See :func:`preprocessing.normalize`

    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean (default=False)
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    vocabulary_sizes : list(int) (default=None)
        Specify the exact vocabulary size to use while encoding each column
        in the input dataset. The vocabulary size of a column corresponds to
        the number of features in its TF-IDF encoding, before the feature
        matrices are concatenated. If the feature matrix of column ``i`` has
        more features than the corresponding vocabulary size, only the first
        ``vocabulary_sizes[i]`` features are kept. If the feature matrix of
        column ``i`` has fewer features than the corresponding vocabulary
        size, zero columns are added to the feature matrix until it has
        ``vocabulary_sizes[i]`` features. This parameter is useful if the
        total number of features of the encoding has to be constant.

    ignore_columns_with_zero_vocabulary_size : boolean (default=True)
        If True, a ValueError raised by
        ``sklearn.feature_extraction.text.TfidfVectorizer`` because of
        over-pruning of terms is ignored, and an empty
        ``scipy.sparse.csr_matrix`` is used in place of the given column's
        TF-IDF document-term matrix.

    Attributes
    ----------
    vectorizers_ : list of ``sklearn.feature_extraction.text.TfidfVectorizer``
        List of ``sklearn.feature_extraction.text.TfidfVectorizer`` objects,
        one fitted separately on each input column.
        ``len(self.vectorizers_)`` is equal to the number of input columns.

    Notes
    -----
    MultiColumnTfidfVectorizer should be used with 2D arrays of text strings.
    For 1D arrays of text data, use
    ``sklearn.feature_extraction.text.TfidfVectorizer`` or reshape the input
    with ``array.reshape(-1, 1)``.
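
    Examples
    --------
    A minimal illustrative sketch. The corpus below is made up, and it is
    assumed that ``MultiColumnTfidfVectorizer`` has been imported from this
    module. Each of the two columns is encoded with its own vocabulary
    (4 and 6 terms respectively) and the per-column document-term matrices
    are concatenated horizontally:

    >>> import numpy as np
    >>> corpus = np.array([
    ...     ["the cat sat", "dogs bark loudly"],
    ...     ["the cat ran", "cats meow softly"],
    ... ])
    >>> MultiColumnTfidfVectorizer().fit_transform(corpus).shape
    (2, 10)

    With ``vocabulary_sizes``, the width of each column's block is fixed:
    column 0 (4 terms) is padded with zero columns up to width 5, and
    column 1 (6 terms) is truncated to 5 features, so the total number of
    output features stays constant:

    >>> MultiColumnTfidfVectorizer(vocabulary_sizes=[5, 5]).fit_transform(corpus).shape
    (2, 10)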
    """

    def __init__(
        self,
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        max_df=1.0,
        min_df=1,
        max_features=1000,
        vocabulary=None,
        dtype=np.float64,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
        vocabulary_sizes=None,
        ignore_columns_with_zero_vocabulary_size=True,
    ):
        self.strip_accents = strip_accents
        self.lowercase = lowercase
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.token_pattern = token_pattern
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vocabulary = vocabulary
        self.dtype = dtype
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf
        self.vocabulary_sizes = vocabulary_sizes
        self.ignore_columns_with_zero_vocabulary_size = ignore_columns_with_zero_vocabulary_size

    def _fit_vectorizer(self, col_idx, X):
        max_features = self.max_features

        # Override max_features for the current column in order to enforce the vocabulary size.
        if self.max_features and self.vocabulary_sizes:
            max_features = min(self.max_features, self.vocabulary_sizes[col_idx])
        elif self.vocabulary_sizes:
            max_features = self.vocabulary_sizes[col_idx]

        try:
            vectorizer = TfidfVectorizer(
                strip_accents=self.strip_accents,
                lowercase=self.lowercase,
                preprocessor=self.preprocessor,
                tokenizer=self.tokenizer,
                stop_words=self.stop_words,
                token_pattern=self.token_pattern,
                ngram_range=self.ngram_range,
                analyzer=self.analyzer,
                max_df=self.max_df,
                min_df=self.min_df,
                max_features=max_features,
                vocabulary=self.vocabulary,
                dtype=self.dtype,
                norm=self.norm,
                use_idf=self.use_idf,
                smooth_idf=self.smooth_idf,
                sublinear_tf=self.sublinear_tf,
            )
            vectorizer.fit(X[:, col_idx])
        except ValueError as err:
            zero_vocab_errors = [
                "After pruning, no terms remain. Try a lower min_df or a higher max_df.",
                "max_df corresponds to < documents than min_df",
                "empty vocabulary; perhaps the documents only contain stop words",
            ]
            if str(err) in zero_vocab_errors and self.ignore_columns_with_zero_vocabulary_size:
                vectorizer = None
            else:
                raise
        return vectorizer

    def fit(self, X, y=None):
        """Build the list of TfidfVectorizers for each column.

        Parameters
        ----------
        X : {array-like}, text data

        Returns
        -------
        self : MultiColumnTfidfVectorizer
        """
        X = check_array(X, dtype=None)
        n_columns = X.shape[1]

        # If specified, vocabulary size must be given for each column of the input dataset.
        if self.vocabulary_sizes and len(self.vocabulary_sizes) != n_columns:
            raise ValueError("If specified, vocabulary_sizes has to have exactly one entry per data column.")

        self.vectorizers_ = [self._fit_vectorizer(i, X) for i in range(n_columns)]

        return self

    def _transform_vectorizer(self, col_idx, X):
        if self.vectorizers_[col_idx]:
            tfidf_features = self.vectorizers_[col_idx].transform(X[:, col_idx])
            # If the vocabulary size is specified and there are too few features, then pad the output with zeros.
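            # Rebuilding the CSR matrix with a wider ``shape`` reuses the existing
            # (data, indices, indptr) buffers; the extra trailing columns have no
            # stored entries, so they act as implicit all-zero padding.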
            if self.vocabulary_sizes and tfidf_features.shape[1] < self.vocabulary_sizes[col_idx]:
                tfidf_features = sp.csr_matrix(
                    (tfidf_features.data, tfidf_features.indices, tfidf_features.indptr),
                    shape=(tfidf_features.shape[0], self.vocabulary_sizes[col_idx]),
                )
            return tfidf_features

        # If ``TfidfVectorizer`` raised a ValueError, use an empty TF-IDF document-term matrix for the column.
        return sp.csr_matrix((X.shape[0], 0))

    def transform(self, X, y=None):
        """Transform documents to a document-term matrix.

        Parameters
        ----------
        X : 2D array of text data

        Returns
        -------
        tfidf_matrix : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, "vectorizers_")
        X = check_array(X, dtype=None)

        return sp.hstack([self._transform_vectorizer(i, X) for i in range(X.shape[1])])

    def _more_tags(self):
        return {"X_types": ["string"]}
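

# Illustrative sketch, not part of the module's API: with the default
# ``ignore_columns_with_zero_vocabulary_size=True``, a column whose documents are
# pruned down to an empty vocabulary (for example, a column containing only
# English stop words when ``stop_words="english"``) contributes zero features
# instead of raising a ValueError:
#
#     corpus = np.array([["the of", "red fish"], ["an the", "blue fish"]])
#     MultiColumnTfidfVectorizer(stop_words="english").fit_transform(corpus).shape
#     # expected: (2, 3), since column 0 yields no terms and column 1 yields
#     # the terms {"red", "blue", "fish"}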