Path: blob/master/clustering/tfidf/feature_extraction.py
import re
import numpy as np
from collections import defaultdict
from scipy.sparse import spdiags, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


__all__ = [
    'CountVectorizer',
    'TfidfTransformer',
    'TfidfVectorizer']


class CountVectorizer(BaseEstimator):
    """
    Convert a collection of text documents to a matrix of token counts.
    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    The number of features will be equal to the vocabulary size found by
    analyzing all input documents and removing stop words.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the feature should be made of words; if n-grams are specified,
        the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero counts are set to 1. This is useful for discrete
        probabilistic models such as binomial naive bayes that model binary
        events rather than integer counts.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1),
                 token_pattern = r'\b\w\w+\b', stop_words = None,
                 lowercase = True, binary = False):
        self.binary = binary
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.token_pattern = token_pattern

    def fit(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        self.fit_transform(raw_documents)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary and return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.
        """
        if isinstance(raw_documents, str):
            raise ValueError(
                'Iterable over raw text documents expected, '
                'string object received')

        X, vocabulary = self._count_vocab(raw_documents, fixed_vocab = False)
        if self.binary:
            X.data.fill(1)

        # we can add additional filtering after we construct
        # the document-term matrix, but this is omitted for now
        self.vocabulary_ = vocabulary
        return X

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create the sparse feature matrix, and the vocabulary if fixed_vocab = False"""
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # add a new index when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        # callable that splits strings into n-gram tokens
        analyze = self._build_analyzer()

        # information to create the sparse csr_matrix
        values = []
        indptr = []
        indices = []
        indptr.append(0)
        for doc in raw_documents:
            # maps feature index to count
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # ignore out-of-vocabulary items for fixed_vocab = True
                    continue

            indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(indices))

        # disable defaultdict behaviour
        if not fixed_vocab:
            vocabulary = dict(vocabulary)

        indices = np.asarray(indices, dtype = np.intc)
        values = np.asarray(values, dtype = np.intc)
        indptr = np.asarray(indptr, dtype = np.intc)
        shape = len(indptr) - 1, len(vocabulary)
        X = csr_matrix((values, indices, indptr), shape = shape, dtype = np.intc)
        return X, vocabulary

    def _build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization"""
        if callable(self.analyzer):
            return self.analyzer
        elif self.analyzer == 'word':
            tokenize = self._build_tokenizer()
            stop_words = self._get_stop_words()
            return lambda doc: self._word_ngrams(tokenize(doc), stop_words)
        else:
            raise ValueError('{} is not a valid tokenization scheme/analyzer'.format(
                self.analyzer))

    def _build_tokenizer(self):
        """Return a function that splits a string into a sequence of tokens"""
        token_pattern = re.compile(self.token_pattern)
        if self.lowercase:
            return lambda doc: token_pattern.findall(doc.lower())
        else:
            return lambda doc: token_pattern.findall(doc)

    def _get_stop_words(self):
        """Build or fetch the effective stop words frozenset"""
        stop = self.stop_words
        if stop == 'english':
            return ENGLISH_STOP_WORDS
        elif stop is None:
            return None
        elif isinstance(stop, str):
            raise ValueError("Stop words not a collection")
        else:
            return frozenset(stop)

    def _word_ngrams(self, tokens, stop_words):
        """Tokenize document into a sequence of n-grams after stop words filtering"""
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n == 1:
            return tokens
        else:
            original_tokens = list(tokens)
            n_original_tokens = len(original_tokens)
            if min_n == 1:
                min_n += 1
            else:
                tokens = []

            # bind method outside of loop to reduce overhead,
            # as local variables are accessed more quickly than attribute lookups
            # https://wiki.python.org/moin/PythonSpeed
            # https://stackoverflow.com/questions/28597014/python-why-is-accessing-instance-attribute-is-slower-than-local
            tokens_append = tokens.append
            space_join = ' '.join
            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i:i + n]))

            return tokens

    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.
        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Document-term matrix.
        """

        # use the same matrix-building strategy as fit_transform
        X, _ = self._count_vocab(raw_documents, fixed_vocab = True)
        if self.binary:
            X.data.fill(1)

        return X


class TfidfTransformer(BaseEstimator, TransformerMixin):
    """
    Transform a count matrix to a tf-idf representation.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy the input data and operate on the copy, or perform
        the operations in place.
    """

    def __init__(self, norm = 'l2', smooth_idf = True, sublinear_tf = False, copy = True):
        self.norm = norm
        self.copy = copy
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y = None):
        """
        Learn the idf vector.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape
        doc_freq = np.bincount(X.indices, minlength = X.shape[1])

        # perform idf smoothing if required
        doc_freq += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log + 1 instead of log makes sure terms with zero idf
        # don't get suppressed entirely
        idf = np.log(float(n_samples) / doc_freq) + 1.0
        self._idf_diag = spdiags(idf, diags = 0, m = n_features, n = n_features, format = 'csr')
        return self

    def transform(self, X):
        """
        Transform a count matrix to a tf-idf representation.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        if self.copy:
            X = X.copy()

        if self.sublinear_tf:
            X.data = np.log(X.data)
            X.data += 1

        # compute the tfidf matrix
        X *= self._idf_diag

        if self.norm is not None:
            X = normalize(X, norm = self.norm, copy = False)

        return X


class TfidfVectorizer(CountVectorizer):
    """
    Convert a collection of raw documents to a matrix of TF-IDF features.
    This is equivalent to CountVectorizer followed by TfidfTransformer.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the feature should be made of words; if n-grams are specified,
        the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        becomes binary.

    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy the input data and operate on the copy, or perform
        the operations in place.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1), token_pattern = r'\b\w\w+\b',
                 stop_words = None, lowercase = True, binary = False, norm = 'l2',
                 smooth_idf = True, sublinear_tf = False, copy = True):
        super().__init__(
            analyzer = analyzer, ngram_range = ngram_range,
            token_pattern = token_pattern, stop_words = stop_words,
            lowercase = lowercase, binary = binary)

        self._tfidf = TfidfTransformer(
            norm = norm, smooth_idf = smooth_idf, sublinear_tf = sublinear_tf, copy = copy)

    def fit(self, raw_documents, y = None):
        """
        Learn vocabulary and idf from the training set.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn vocabulary and idf, and return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().fit_transform(raw_documents)
        return self._tfidf.fit_transform(X)

    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies learned by fit or
        fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().transform(raw_documents)
        return self._tfidf.transform(X)

    # broadcast the tf-idf parameters to the underlying TfidfTransformer
    # instance to enable hyperparameter search and repr
    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def copy(self):
        return self._tfidf.copy

    @copy.setter
    def copy(self, value):
        self._tfidf.copy = value
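

if __name__ == '__main__':
    # Usage sketch, not part of the original module: a tiny made-up corpus to
    # show how the classes above fit together. It only assumes the classes
    # defined in this file plus numpy, which is already imported at the top.
    corpus = [
        'The quick brown fox jumped over the lazy dog.',
        'The lazy dog slept in the sun.',
        'The quick brown fox is quick.']

    # unigram + bigram counts, with English stop words removed
    count_vec = CountVectorizer(ngram_range = (1, 2), stop_words = 'english')
    counts = count_vec.fit_transform(corpus)
    print('count matrix shape:', counts.shape)
    print('vocabulary size:', len(count_vec.vocabulary_))

    # TfidfVectorizer is documented as equivalent to CountVectorizer followed
    # by TfidfTransformer, so the one-step and two-step results should match
    tfidf_vec = TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english')
    tfidf_one_step = tfidf_vec.fit_transform(corpus)
    tfidf_two_step = TfidfTransformer().fit_transform(counts)
    print('one-step and two-step tf-idf match:',
          np.allclose(tfidf_one_step.toarray(), tfidf_two_step.toarray()))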