GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering/tfidf/feature_extraction.py
import re
import numpy as np
from collections import defaultdict
from scipy.sparse import spdiags, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


__all__ = [
    'CountVectorizer',
    'TfidfTransformer',
    'TfidfVectorizer']


class CountVectorizer(BaseEstimator):
    """
    Convert a collection of text documents to a matrix of token counts.
    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    The number of features will be equal to the vocabulary size found by
    analyzing all input documents, after removing stop words.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the features should be made of word n-grams. If n-grams are
        specified, the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero counts are set to 1. This is useful for discrete
        probabilistic models such as Bernoulli naive bayes that model binary
        events rather than integer counts.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1),
                 token_pattern = r'\b\w\w+\b', stop_words = None,
                 lowercase = True, binary = False):
        self.binary = binary
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.token_pattern = token_pattern

    def fit(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        self.fit_transform(raw_documents)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn the vocabulary dictionary and return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.
        """
        if isinstance(raw_documents, str):
            raise ValueError(
                'Iterable over raw text documents expected, '
                'string object received')

        X, vocabulary = self._count_vocab(raw_documents, fixed_vocab = False)
        if self.binary:
            X.data.fill(1)

        # we can add additional filtering after we construct
        # the document-term matrix, but this is omitted for now
        self.vocabulary_ = vocabulary
        return X

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary if fixed_vocab = False"""
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # assign a new index whenever a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        # callable that splits strings into (possibly n-grammed) tokens
        analyze = self._build_analyzer()

        # information to create the sparse csr_matrix
        # (see the worked illustration after this method)
        values = []
        indptr = []
        indices = []
        indptr.append(0)
        for doc in raw_documents:
            # maps feature index to count
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # ignore out-of-vocabulary items for fixed_vocab = True
                    continue

            indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(indices))

        # disable defaultdict behaviour
        if not fixed_vocab:
            vocabulary = dict(vocabulary)

        indices = np.asarray(indices, dtype = np.intc)
        values = np.asarray(values, dtype = np.intc)
        indptr = np.asarray(indptr, dtype = np.intc)
        shape = len(indptr) - 1, len(vocabulary)
        X = csr_matrix((values, indices, indptr), shape = shape, dtype = np.intc)
        return X, vocabulary

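    # A worked illustration (not part of the original source) of how the three
    # arrays above encode the document-term matrix. Assume, hypothetically, that
    # the corpus is ['bb aa aa', 'aa cc'] and the learned vocabulary is
    # {'bb': 0, 'aa': 1, 'cc': 2}. Then
    #     values  = [1, 2, 1, 1]   non-zero counts, document by document
    #     indices = [0, 1, 1, 2]   column (feature) index of each count
    #     indptr  = [0, 2, 4]      document i owns entries indptr[i]:indptr[i + 1]
    # and csr_matrix((values, indices, indptr), shape = (2, 3)) is the matrix
    #     [[1, 2, 0],
    #      [0, 1, 1]]
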
    def _build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization"""
        if callable(self.analyzer):
            return self.analyzer
        elif self.analyzer == 'word':
            tokenize = self._build_tokenizer()
            stop_words = self._get_stop_words()
            return lambda doc: self._word_ngrams(tokenize(doc), stop_words)
        else:
            raise ValueError('{} is not a valid tokenization scheme/analyzer'.format(
                self.analyzer))

    def _build_tokenizer(self):
        """Returns a function that splits a string into a sequence of tokens"""
        token_pattern = re.compile(self.token_pattern)
        if self.lowercase:
            return lambda doc: token_pattern.findall(doc.lower())
        else:
            return lambda doc: token_pattern.findall(doc)

    def _get_stop_words(self):
        """Build or fetch the effective stop words frozenset"""
        stop = self.stop_words
        if stop == 'english':
            return ENGLISH_STOP_WORDS
        elif stop is None:
            return None
        elif isinstance(stop, str):
            raise ValueError("Stop words not a collection")
        else:
            return frozenset(stop)

    def _word_ngrams(self, tokens, stop_words):
        """Turn tokens into a sequence of n-grams after stop words filtering"""
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams (an illustration follows this method)
        min_n, max_n = self.ngram_range
        if max_n == 1:
            return tokens
        else:
            original_tokens = list(tokens)
            n_original_tokens = len(original_tokens)
            if min_n == 1:
                min_n += 1
            else:
                tokens = []

            # bind method outside of loop to reduce overhead,
            # as local variables are accessed more quickly than attribute lookups
            # https://wiki.python.org/moin/PythonSpeed
            # https://stackoverflow.com/questions/28597014/python-why-is-accessing-instance-attribute-is-slower-than-local
            tokens_append = tokens.append
            space_join = ' '.join
            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i:i + n]))

            return tokens

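    # Illustration (not from the original source): with ngram_range = (1, 2)
    # and tokens ['quick', 'brown', 'fox'], the unigrams are kept and the
    # bigrams are appended, so the method returns
    #     ['quick', 'brown', 'fox', 'quick brown', 'brown fox']
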
    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.
        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Document-term matrix.
        """

        # use the same matrix-building strategy as fit_transform
        X, _ = self._count_vocab(raw_documents, fixed_vocab = True)
        if self.binary:
            X.data.fill(1)

        return X

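
# A minimal usage sketch of CountVectorizer (illustrative only; the corpus
# below is made up for this example). Assuming
#     docs = ['the quick brown fox', 'the lazy brown dog']
#     vect = CountVectorizer(stop_words = 'english')
#     X = vect.fit_transform(docs)
# then vect.vocabulary_ maps the five remaining tokens ('quick', 'brown',
# 'fox', 'lazy', 'dog') to column indices and X is a 2 x 5 sparse count matrix.
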

class TfidfTransformer(BaseEstimator, TransformerMixin):
    """
    Transform a count matrix to a tf-idf representation.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy input data and operate on the copy or perform in-place operations.
    """

    def __init__(self, norm = 'l2', smooth_idf = True, sublinear_tf = False, copy = True):
        self.norm = norm
        self.copy = copy
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y = None):
        """
        Learn the idf vector.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape

        # document frequency: the number of documents containing each term
        # (a count matrix stores at most one non-zero entry per term per document)
        doc_freq = np.bincount(X.indices, minlength = n_features)

        # perform idf smoothing if required
        doc_freq += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log + 1 instead of log makes sure terms with zero idf
        # don't get suppressed entirely (a worked example follows this method)
        idf = np.log(float(n_samples) / doc_freq) + 1.0
        self._idf_diag = spdiags(idf, diags = 0, m = n_features, n = n_features, format = 'csr')
        return self

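    # Worked example (illustrative, not from the original source): with
    # smooth_idf = True, a corpus of 4 documents and a term that appears
    # in 2 of them gets
    #     idf = log((4 + 1) / (2 + 1)) + 1, roughly 1.51
    # while a term that appears in every document gets
    #     idf = log(5 / 5) + 1 = 1
    # so even the most common terms are never zeroed out completely.
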
    def transform(self, X):
        """
        Transform a count matrix to tf-idf representation.

        Parameters
        ----------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Count document-term matrix.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        if self.copy:
            X = X.copy()

        if self.sublinear_tf:
            X.data = np.log(X.data)
            X.data += 1

        # compute the tfidf matrix
        X *= self._idf_diag

        if self.norm is not None:
            X = normalize(X, norm = self.norm, copy = False)

        return X


class TfidfVectorizer(CountVectorizer):
    """
    Convert a collection of raw documents to a matrix of TF-IDF features.
    This is equivalent to CountVectorizer followed by TfidfTransformer.

    Parameters
    ----------
    analyzer : str {'word'} or callable
        Whether the features should be made of word n-grams. If n-grams are
        specified, the words are concatenated with a space.
        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    token_pattern : str
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    stop_words : str {'english'}, collection, or None, default None
        - If 'english', a built-in stop word list for English is used.
        - If a collection, that list or set is assumed to contain stop words,
          all of which will be removed from the resulting tokens. Only applies
          if ``analyzer == 'word'``.
        - If None, no stop words will be used.

    lowercase : bool, default True
        Convert all characters to lowercase before tokenizing.

    binary : bool, default False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only the tf term in tf-idf will
        become binary.

    norm : 'l1', 'l2' or None, default 'l2'
        Norm used to normalize term vectors. None for no normalization.

    smooth_idf : bool, default True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    copy : bool, default True
        Whether to copy input data and operate on the copy or perform in-place operations.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.
    """

    def __init__(self, analyzer = 'word', ngram_range = (1, 1), token_pattern = r'\b\w\w+\b',
                 stop_words = None, lowercase = True, binary = False, norm = 'l2',
                 smooth_idf = True, sublinear_tf = False, copy = True):
        super().__init__(
            analyzer = analyzer, ngram_range = ngram_range,
            token_pattern = token_pattern, stop_words = stop_words,
            lowercase = lowercase, binary = binary)

        self._tfidf = TfidfTransformer(
            norm = norm, smooth_idf = smooth_idf, sublinear_tf = sublinear_tf, copy = copy)

    def fit(self, raw_documents, y = None):
        """
        Learn vocabulary and idf from the training set.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        self
        """
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y = None):
        """
        Learn vocabulary and idf, return the document-term matrix.
        This is equivalent to calling fit followed by transform, but more
        efficiently implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        y : default None
            Ignored, argument required for constructing sklearn Pipeline.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().fit_transform(raw_documents)
        return self._tfidf.fit_transform(X)

    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies learned by fit or
        fit_transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields str.

        Returns
        -------
        X : scipy sparse matrix, shape [n_samples, n_features]
            Tf-idf weighted document-term matrix.
        """
        X = super().transform(raw_documents)
        return self._tfidf.transform(X)

    # broadcast the TfidfTransformer's parameters to the underlying transformer
    # instance to enable hyperparameter search and repr
    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def copy(self):
        return self._tfidf.copy

    @copy.setter
    def copy(self, value):
        self._tfidf.copy = value

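
# A minimal end-to-end sketch (not part of the original module): it only runs
# when the file is executed directly and uses a tiny made-up corpus to show
# the CountVectorizer and TfidfVectorizer classes defined above in action.
if __name__ == '__main__':
    corpus = [
        'the quick brown fox jumps over the lazy dog',
        'the lazy dog sleeps',
        'the quick brown fox']

    count_vect = CountVectorizer(ngram_range = (1, 2), stop_words = 'english')
    dtm = count_vect.fit_transform(corpus)
    print('vocabulary size:', len(count_vect.vocabulary_))
    print('count matrix shape:', dtm.shape)

    tfidf_vect = TfidfVectorizer(stop_words = 'english')
    tfidf = tfidf_vect.fit_transform(corpus)
    print('tf-idf matrix shape:', tfidf.shape)
    # rows are l2-normalized by default, so every non-empty row of the dense
    # view below should have unit length
    print(tfidf.toarray())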