Path: blob/master/CountVectorizer/CountVectorizer.ipynb
314 views
Kernel: Python 3
CountVectorizer Usage Examples
In [3]:
Warm Up Example
In [4]:
In [5]:
In [6]:
Out[6]:
{'one': 7,
'cent': 2,
'two': 8,
'cents': 3,
'old': 6,
'new': 5,
'all': 1,
'about': 0,
'money': 4}
In [7]:
Out[7]:
(1, 9)
In [8]:
Out[8]:
set()
CountVectorizer With More Data
Plain and Simple
In [9]:
In [10]:
In [11]:
Out[11]:
{'one': 28,
'cent': 8,
'two': 40,
'cents': 9,
'old': 26,
'new': 23,
'all': 1,
'about': 0,
'money': 22,
'cat': 7,
'in': 16,
'the': 37,
'hat': 13,
'learning': 19,
'library': 20,
'inside': 18,
'your': 42,
'outside': 30,
'human': 15,
'body': 4,
'oh': 25,
'things': 39,
'you': 41,
'can': 6,
'do': 10,
'that': 36,
'are': 2,
'good': 12,
'for': 11,
'staying': 34,
'healthy': 14,
'on': 27,
'beyond': 3,
'bugs': 5,
'insects': 17,
'there': 38,
'no': 24,
'place': 31,
'like': 21,
'space': 33,
'our': 29,
'solar': 32,
'system': 35}
In [14]:
Out[14]:
(5, 43)
In [15]:
Out[15]:
set()
CountVectorizer With Custom StopWords
In [21]:
Out[21]:
(5, 40)
In [22]:
Out[22]:
['all', 'in', 'the', 'is', 'and']
In [23]:
Out[23]:
set()
CountVectorizer With Predefined StopWords
In [435]:
In [436]:
Out[436]:
(5, 24)
In [437]:
Out[437]:
'english'
In [438]:
Out[438]:
set()
In [439]:
Out[439]:
{'cent': 3,
'cents': 4,
'old': 17,
'new': 15,
'money': 14,
'cat': 2,
'hat': 6,
'learning': 11,
'library': 12,
'inside': 10,
'outside': 18,
'human': 8,
'body': 0,
'oh': 16,
'things': 23,
'good': 5,
'staying': 22,
'healthy': 7,
'bugs': 1,
'insects': 9,
'place': 19,
'like': 13,
'space': 21,
'solar': 20}
CountVectorizer With MIN_DF as StopWords
In [440]:
In [441]:
Out[441]:
{'are',
'beyond',
'body',
'bugs',
'can',
'cent',
'cents',
'do',
'for',
'good',
'healthy',
'human',
'insects',
'inside',
'like',
'money',
'new',
'no',
'oh',
'old',
'on',
'one',
'our',
'outside',
'place',
'solar',
'space',
'staying',
'system',
'that',
'there',
'things',
'two',
'you',
'your'}
In [442]:
Out[442]:
(5, 8)
In [443]:
In [444]:
Out[444]:
(5, 8)
In [445]:
Out[445]:
{'all': 1,
'about': 0,
'cat': 2,
'in': 4,
'the': 7,
'hat': 3,
'learning': 5,
'library': 6}
In [446]:
Out[446]:
{'are',
'beyond',
'body',
'bugs',
'can',
'cent',
'cents',
'do',
'for',
'good',
'healthy',
'human',
'insects',
'inside',
'like',
'money',
'new',
'no',
'oh',
'old',
'on',
'one',
'our',
'outside',
'place',
'solar',
'space',
'staying',
'system',
'that',
'there',
'things',
'two',
'you',
'your'}
CountVectorizer With MAX_DF as StopWords
In [447]:
In [448]:
Out[448]:
{'one': 21,
'cent': 5,
'two': 32,
'cents': 6,
'old': 19,
'new': 16,
'money': 15,
'inside': 13,
'your': 34,
'outside': 23,
'human': 11,
'body': 2,
'oh': 18,
'things': 31,
'you': 33,
'can': 4,
'do': 7,
'that': 29,
'are': 0,
'good': 9,
'for': 8,
'staying': 27,
'healthy': 10,
'on': 20,
'beyond': 1,
'bugs': 3,
'insects': 12,
'there': 30,
'no': 17,
'place': 24,
'like': 14,
'space': 26,
'our': 22,
'solar': 25,
'system': 28}
In [449]:
Out[449]:
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}
In [450]:
In [451]:
Out[451]:
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}
Custom Preprocessing
In [32]:
In [33]:
Out[33]:
{'one': 25,
'cent': 8,
'two': 37,
'old': 23,
'new': 20,
'_connector_': 0,
'about': 1,
'money': 19,
'cat': 7,
'the': 34,
'hat': 11,
'learn': 16,
'librari': 17,
'insid': 15,
'your': 39,
'outsid': 27,
'human': 13,
'bodi': 4,
'oh': 22,
'thing': 36,
'you': 38,
'can': 6,
'do': 9,
'that': 33,
'are': 2,
'good': 10,
'stay': 31,
'healthi': 12,
'on': 24,
'beyond': 3,
'bug': 5,
'insect': 14,
'there': 35,
'no': 21,
'place': 28,
'like': 18,
'space': 30,
'our': 26,
'solar': 29,
'system': 32}
In [454]:
Working With N-Grams
In [455]:
In [456]:
Out[456]:
{'one cent': 35,
'cent two': 19,
'two cent': 47,
'cent old': 18,
'old cent': 33,
'cent new': 17,
'new cent': 30,
'cent _connector_': 16,
'_connector_ about': 0,
'about money': 7,
'money cat': 29,
'cat _connector_': 15,
'_connector_ the': 2,
'the hat': 44,
'hat learn': 22,
'learn librari': 27,
'insid your': 26,
'your outsid': 50,
'outsid _connector_': 37,
'about _connector_': 5,
'_connector_ human': 1,
'human bodi': 24,
'bodi cat': 12,
'oh _connector_': 32,
'_connector_ thing': 3,
'thing you': 46,
'you can': 49,
'can do': 14,
'do that': 20,
'that are': 43,
'are good': 10,
'good _connector_': 21,
'_connector_ you': 4,
'you _connector_': 48,
'about stay': 9,
'stay healthi': 41,
'healthi cat': 23,
'on beyond': 34,
'beyond bug': 11,
'bug _connector_': 13,
'about insect': 6,
'insect cat': 25,
'there no': 45,
'no place': 31,
'place like': 38,
'like space': 28,
'space _connector_': 40,
'about our': 8,
'our solar': 36,
'solar system': 39,
'system cat': 42}
In [457]:
Out[457]:
(5, 51)
In [458]:
In [459]:
Out[459]:
{'one': 60,
'cent': 24,
'two': 84,
'old': 56,
'new': 50,
'_connector_': 0,
'about': 6,
'money': 48,
'cat': 22,
'the': 78,
'hat': 33,
'learn': 43,
'librari': 45,
'one cent': 61,
'cent two': 28,
'two cent': 85,
'cent old': 27,
'old cent': 57,
'cent new': 26,
'new cent': 51,
'cent _connector_': 25,
'_connector_ about': 1,
'about money': 9,
'money cat': 49,
'cat _connector_': 23,
'_connector_ the': 3,
'the hat': 79,
'hat learn': 34,
'learn librari': 44,
'insid': 41,
'your': 89,
'outsid': 64,
'human': 37,
'bodi': 16,
'insid your': 42,
'your outsid': 90,
'outsid _connector_': 65,
'about _connector_': 7,
'_connector_ human': 2,
'human bodi': 38,
'bodi cat': 17,
'oh': 54,
'thing': 82,
'you': 86,
'can': 20,
'do': 29,
'that': 76,
'are': 12,
'good': 31,
'stay': 72,
'healthi': 35,
'oh _connector_': 55,
'_connector_ thing': 4,
'thing you': 83,
'you can': 88,
'can do': 21,
'do that': 30,
'that are': 77,
'are good': 13,
'good _connector_': 32,
'_connector_ you': 5,
'you _connector_': 87,
'about stay': 11,
'stay healthi': 73,
'healthi cat': 36,
'on': 58,
'beyond': 14,
'bug': 18,
'insect': 39,
'on beyond': 59,
'beyond bug': 15,
'bug _connector_': 19,
'about insect': 8,
'insect cat': 40,
'there': 80,
'no': 52,
'place': 66,
'like': 46,
'space': 70,
'our': 62,
'solar': 68,
'system': 74,
'there no': 81,
'no place': 53,
'place like': 67,
'like space': 47,
'space _connector_': 71,
'about our': 10,
'our solar': 63,
'solar system': 69,
'system cat': 75}
In [460]:
Out[460]:
(5, 91)
Working With Character N-Grams
In [461]:
In [462]:
Out[462]:
{' o': 11,
'on': 77,
'ne': 67,
'e ': 36,
' c': 3,
'ce': 30,
'en': 40,
'nt': 72,
't ': 96,
' t': 14,
'tw': 102,
'wo': 109,
'o ': 73,
'ol': 76,
'ld': 58,
'd ': 33,
' n': 10,
'ew': 42,
'w ': 108,
' _': 0,
'_c': 17,
'co': 31,
'nn': 69,
'ec': 38,
'ct': 32,
'to': 100,
'or': 79,
'r_': 84,
'_ ': 16,
' a': 1,
'ab': 18,
'bo': 26,
'ou': 80,
'ut': 107,
' m': 9,
'mo': 64,
'ey': 43,
'y ': 110,
'ca': 29,
'at': 23,
'th': 99,
'he': 48,
' h': 6,
'ha': 47,
' s': 13,
's ': 89,
' l': 8,
'le': 59,
'ea': 37,
'ar': 22,
'rn': 88,
'n ': 65,
'li': 60,
'ib': 52,
'br': 27,
'ra': 85,
'ri': 87,
'i ': 51,
' i': 7,
'in': 55,
'ns': 71,
'si': 91,
'id': 53,
' y': 15,
'yo': 111,
'ur': 106,
'r ': 83,
'ts': 101,
'hu': 50,
'um': 105,
'ma': 63,
'an': 21,
' b': 2,
'od': 74,
'di': 34,
'oh': 75,
'h ': 46,
'hi': 49,
'ng': 68,
'g ': 44,
'u ': 103,
' d': 4,
'do': 35,
're': 86,
' g': 5,
'go': 45,
'oo': 78,
'st': 94,
'ta': 97,
'ay': 24,
'al': 20,
'lt': 61,
'be': 25,
'nd': 66,
'bu': 28,
'ug': 104,
'se': 90,
'er': 41,
'no': 70,
' p': 12,
'pl': 82,
'la': 57,
'ac': 19,
'ik': 54,
'ke': 56,
'sp': 93,
'pa': 81,
'so': 92,
'sy': 95,
'ys': 112,
'te': 98,
'em': 39,
'm ': 62}
In [463]:
Out[463]:
(5, 113)
Limiting Vocabulary Size
In [464]:
In [465]:
Out[465]:
(5, 10)
In [466]:
Out[466]:
{'_connector_': 0,
'cat': 1,
'the': 8,
'hat': 3,
'learn': 5,
'librari': 7,
'cat _connector_': 2,
'the hat': 9,
'hat learn': 4,
'learn librari': 6}
Extracting Counts of Words / N-Grams
In [37]:
In [54]:
Out[54]:
[('cent', 4),
('_connector_', 2),
('two cent', 1),
('two', 1),
('the hat', 1),
('the', 1),
('one cent', 1),
('one', 1),
('old cent', 1),
('old', 1)]
Binary Values Instead of Counts
In [60]:
Out[60]:
(0, 35) 1
(0, 74) 1
(0, 4) 1
(0, 90) 1
(0, 29) 1
(0, 31) 1
(0, 9) 1
(0, 78) 1
(0, 27) 1
(0, 17) 1
(0, 91) 1
(0, 86) 1
(0, 82) 1
(0, 56) 1
(0, 34) 1
(0, 73) 1
(0, 28) 1
(0, 30) 1
(0, 8) 1
(0, 77) 1
(0, 26) 1
(0, 16) 1
(0, 89) 1
(0, 85) 1
(0, 55) 1
(0, 45) 1
(0, 33) 1
(0, 80) 1
(0, 39) 1
(0, 19) 1
(0, 7) 1
(0, 46) 1
(0, 44) 1
(0, 32) 1
(0, 79) 1
(0, 38) 1
(0, 18) 1
(0, 0) 1
(0, 6) 1
Custom Tokenizer
In [78]:
Out[78]:
{'one': 34, 'cent': 14, ',': 4, 'two': 47, 'cents': 15, 'old': 32, 'new': 29, ':': 5, 'all': 7, 'about': 6, 'money': 28, '(': 2, 'cat': 13, 'in': 22, 'the': 44, 'hat': 19, "'": 1, 's': 38, 'learning': 25, 'library': 26, 'inside': 24, 'your': 49, 'outside': 36, 'human': 21, 'body': 10, ')': 3, '': 0, 'oh': 31, 'things': 46, 'you': 48, 'can': 12, 'do': 16, 'that': 43, 'are': 8, 'good': 18, 'for': 17, 'staying': 41, 'healthy': 20, 'on': 33, 'beyond': 9, 'bugs': 11, 'insects': 23, 'there': 45, 'no': 30, 'place': 37, 'like': 27, 'space': 40, 'our': 35, 'solar': 39, 'system': 42}
In [ ]: