Path: blob/master/clustering/tfidf/feature_extraction.py
import re
import numpy as np
from collections import defaultdict
from scipy.sparse import spdiags, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


__all__ = [
    'CountVectorizer',
    'TfidfTransformer',
    'TfidfVectorizer']


class CountVectorizer(BaseEstimator):
    """
    Convert a collection of text documents to a matrix of token counts.
    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    The number of features will be equal to the vocabulary size found by
    analyzing all input documents and removing stop words.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the feature should be made of words; if n-grams are specified,
        the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero counts are set to 1. This is useful for discrete
        probabilistic models such as binomial naive bayes that model binary
        events rather than integer counts.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1),
                 token_pattern = r'\b\w\w+\b', stop_words = None,
                 lowercase = True, binary = False):
        self.binary = binary
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.token_pattern = token_pattern

    def fit(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        self.fit_transform(raw_documents)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary and return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.
        """
        if isinstance(raw_documents, str):
            raise ValueError(
                'Iterable over raw text documents expected, '
                'string object received')

        X, vocabulary = self._count_vocab(raw_documents, fixed_vocab = False)
        if self.binary:
            X.data.fill(1)

        # we can add additional filtering after we construct
        # the document-term matrix, but this is omitted for now
        self.vocabulary_ = vocabulary
        return X

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create the sparse feature matrix, and the vocabulary if fixed_vocab = False"""
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # add a new index when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        # callable that splits strings into n-gram tokens
        analyze = self._build_analyzer()

        # information to create the sparse csr_matrix
        values = []
        indptr = []
        indices = []
        indptr.append(0)
        for doc in raw_documents:
            # maps feature index to count
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # ignore out-of-vocabulary items for fixed_vocab = True
                    continue

            indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(indices))

        # disable defaultdict behaviour
        if not fixed_vocab:
            vocabulary = dict(vocabulary)

        indices = np.asarray(indices, dtype = np.intc)
        values = np.asarray(values, dtype = np.intc)
        indptr = np.asarray(indptr, dtype = np.intc)
        shape = len(indptr) - 1, len(vocabulary)
        X = csr_matrix((values, indices, indptr), shape = shape, dtype = np.intc)
        return X, vocabulary

    def _build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization"""
        if callable(self.analyzer):
            return self.analyzer
        elif self.analyzer == 'word':
            tokenize = self._build_tokenizer()
            stop_words = self._get_stop_words()
            return lambda doc: self._word_ngrams(tokenize(doc), stop_words)
        else:
            raise ValueError('{} is not a valid tokenization scheme/analyzer'.format(
                self.analyzer))

    def _build_tokenizer(self):
        """Return a function that splits a string into a sequence of tokens"""
        token_pattern = re.compile(self.token_pattern)
        if self.lowercase:
            return lambda doc: token_pattern.findall(doc.lower())
        else:
            return lambda doc: token_pattern.findall(doc)

    def _get_stop_words(self):
        """Build or fetch the effective stop words frozenset"""
        stop = self.stop_words
        if stop == 'english':
            return ENGLISH_STOP_WORDS
        elif stop is None:
            return None
        elif isinstance(stop, str):
            raise ValueError("Stop words not a collection")
        else:
            return frozenset(stop)

    def _word_ngrams(self, tokens, stop_words):
        """Tokenize document into a sequence of n-grams after stop words filtering"""
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n == 1:
            return tokens
        else:
            original_tokens = list(tokens)
            n_original_tokens = len(original_tokens)
            if min_n == 1:
                min_n += 1
            else:
                tokens = []

            # bind method outside of loop to reduce overhead,
            # as local variables are accessed more quickly than attribute lookups
            # https://wiki.python.org/moin/PythonSpeed
            # https://stackoverflow.com/questions/28597014/python-why-is-accessing-instance-attribute-is-slower-than-local
            tokens_append = tokens.append
            space_join = ' '.join
            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i:i + n]))

            return tokens

    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.
        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Document-term matrix.
        """

        # use the same matrix-building strategy as fit_transform
        X, _ = self._count_vocab(raw_documents, fixed_vocab = True)
        if self.binary:
            X.data.fill(1)

        return X


class TfidfTransformer(BaseEstimator, TransformerMixin):
    """
    Transform a count matrix to a tf-idf representation.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy the input data and operate on the copy, or perform
        the operations in place.
    """

    def __init__(self, norm = 'l2', smooth_idf = True, sublinear_tf = False, copy = True):
        self.norm = norm
        self.copy = copy
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y = None):
        """
        Learn the idf vector.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape
        doc_freq = np.bincount(X.indices, minlength = X.shape[1])

        # perform idf smoothing if required
        doc_freq += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log + 1 instead of log makes sure terms with zero idf
        # don't get suppressed entirely
        idf = np.log(float(n_samples) / doc_freq) + 1.0
        self._idf_diag = spdiags(idf, diags = 0, m = n_features, n = n_features, format = 'csr')
        return self

    def transform(self, X):
        """
        Transform a count matrix to a tf-idf representation.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        if self.copy:
            X = X.copy()

        if self.sublinear_tf:
            X.data = np.log(X.data)
            X.data += 1

        # compute the tfidf matrix
        X *= self._idf_diag

        if self.norm is not None:
            X = normalize(X, norm = self.norm, copy = False)

        return X


class TfidfVectorizer(CountVectorizer):
    """
    Convert a collection of raw documents to a matrix of TF-IDF features.
    This is equivalent to CountVectorizer followed by TfidfTransformer.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the feature should be made of words; if n-grams are specified,
        the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        becomes binary.

    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy the input data and operate on the copy, or perform
        the operations in place.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1), token_pattern = r'\b\w\w+\b',
                 stop_words = None, lowercase = True, binary = False, norm = 'l2',
                 smooth_idf = True, sublinear_tf = False, copy = True):
        super().__init__(
            analyzer = analyzer, ngram_range = ngram_range,
            token_pattern = token_pattern, stop_words = stop_words,
            lowercase = lowercase, binary = binary)

        self._tfidf = TfidfTransformer(
            norm = norm, smooth_idf = smooth_idf, sublinear_tf = sublinear_tf, copy = copy)

    def fit(self, raw_documents, y = None):
        """
        Learn vocabulary and idf from the training set.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn vocabulary and idf, and return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().fit_transform(raw_documents)
        return self._tfidf.fit_transform(X)

    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies learned by fit or
        fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().transform(raw_documents)
        return self._tfidf.transform(X)

    # broadcast the tf-idf parameters to the underlying TfidfTransformer
    # instance to enable hyperparameter search and repr
    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def copy(self):
        return self._tfidf.copy

    @copy.setter
    def copy(self, value):
        self._tfidf.copy = value
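

if __name__ == '__main__':
    # Usage sketch, not part of the original module: a tiny made-up corpus to
    # show how the classes above fit together. It only assumes the classes
    # defined in this file plus numpy, which is already imported at the top.
    corpus = [
        'The quick brown fox jumped over the lazy dog.',
        'The lazy dog slept in the sun.',
        'The quick brown fox is quick.']

    # unigram + bigram counts, with English stop words removed
    count_vec = CountVectorizer(ngram_range = (1, 2), stop_words = 'english')
    counts = count_vec.fit_transform(corpus)
    print('count matrix shape:', counts.shape)
    print('vocabulary size:', len(count_vec.vocabulary_))

    # TfidfVectorizer is documented as equivalent to CountVectorizer followed
    # by TfidfTransformer, so the one-step and two-step results should match
    tfidf_vec = TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english')
    tfidf_one_step = tfidf_vec.fit_transform(corpus)
    tfidf_two_step = TfidfTransformer().fit_transform(counts)
    print('one-step and two-step tf-idf match:',
          np.allclose(tfidf_one_step.toarray(), tfidf_two_step.toarray()))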