GitHub Repository: kavgan/nlp-in-practice
Path: blob/master/CountVectorizer/CountVectorizer.ipynb
Kernel: Python 3

CountVectorizer Usage Examples

from sklearn.feature_extraction.text import CountVectorizer

Warm Up Example

doc=["One Cent, Two Cents, Old Cent, New Cent: All About Money"]
cv = CountVectorizer(doc) count_vector=cv.fit_transform(doc)
# show the resulting vocabulary; the numbers are not counts, they are column positions in the sparse matrix
cv.vocabulary_
{'one': 7, 'cent': 2, 'two': 8, 'cents': 3, 'old': 6, 'new': 5, 'all': 1, 'about': 0, 'money': 4}
# matrix shape: 1 document, 9 unique words
count_vector.shape
(1, 9)
# any words eliminated internally? -- nope
cv.stop_words_
set()
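The vocabulary above only records each term's column position. The actual counts live in the transformed matrix; a minimal check (not in the original notebook) is to densify the 1 x 9 row and pair each count with its term:

# sketch: map every vocabulary term to its count in the single warm-up document
counts = count_vector.toarray()[0]
for term, idx in sorted(cv.vocabulary_.items(), key=lambda kv: kv[1]):
    print(term, counts[idx])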

CountVectorizer With More Data

Plain and Simple

cat_in_the_hat_docs = [
    "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
    "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
    "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
    "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
    "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)"
]
cv = CountVectorizer()
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# show the resulting vocabulary; the numbers are not counts, they are column positions in the sparse matrix
cv.vocabulary_
{'one': 28, 'cent': 8, 'two': 40, 'cents': 9, 'old': 26, 'new': 23, 'all': 1, 'about': 0, 'money': 22, 'cat': 7, 'in': 16, 'the': 37, 'hat': 13, 'learning': 19, 'library': 20, 'inside': 18, 'your': 42, 'outside': 30, 'human': 15, 'body': 4, 'oh': 25, 'things': 39, 'you': 41, 'can': 6, 'do': 10, 'that': 36, 'are': 2, 'good': 12, 'for': 11, 'staying': 34, 'healthy': 14, 'on': 27, 'beyond': 3, 'bugs': 5, 'insects': 17, 'there': 38, 'no': 24, 'place': 31, 'like': 21, 'space': 33, 'our': 29, 'solar': 32, 'system': 35}
# shape of the count vector: 5 docs (book titles) and 43 unique words
count_vector.shape
(5, 43)
# any stop words?
cv.stop_words_
set()
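As an optional inspection step (a sketch, not part of the original notebook), the full 5 x 43 document-term matrix can be wrapped in a pandas DataFrame, one column per vocabulary term:

# rows = book titles, columns = vocabulary terms, cells = raw counts
# (use cv.get_feature_names_out() on scikit-learn 1.0+)
import pandas as pd

pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names())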

CountVectorizer With Custom StopWords

cv = CountVectorizer(stop_words=["all", "in", "the", "is", "and"])
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 40)
# any stop words that we explicitly specified?
cv.stop_words
['all', 'in', 'the', 'is', 'and']
# any stop words internally removed by CountVectorizer?
cv.stop_words_
set()
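The vocabulary only shrinks from 43 to 40 terms because just three of the five custom stop words ('all', 'in', 'the') actually occur in these titles. A quick sanity check (not in the original notebook):

# none of the custom stop words should appear in the fitted vocabulary
assert not set(["all", "in", "the", "is", "and"]) & set(cv.vocabulary_)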

CountVectorizer With Predefined StopWords

cv = CountVectorizer(stop_words="english")
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# the shape should be smaller
count_vector.shape
(5, 24)
# any stop words that we explicitly specified?
cv.stop_words
'english'
# any stop words internally removed by CountVectorizer?
cv.stop_words_
set()
# much smaller vocabulary with stop words applied
cv.vocabulary_
{'cent': 3, 'cents': 4, 'old': 17, 'new': 15, 'money': 14, 'cat': 2, 'hat': 6, 'learning': 11, 'library': 12, 'inside': 10, 'outside': 18, 'human': 8, 'body': 0, 'oh': 16, 'things': 23, 'good': 5, 'staying': 22, 'healthy': 7, 'bugs': 1, 'insects': 9, 'place': 19, 'like': 13, 'space': 21, 'solar': 20}
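The built-in list used by stop_words="english" can be inspected directly if you want to know which terms will be dropped (a quick sketch, not in the original notebook):

# scikit-learn's built-in English stop word list (a frozenset of roughly 300 words)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(len(ENGLISH_STOP_WORDS))
print(sorted(ENGLISH_STOP_WORDS)[:10])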

CountVectorizer with MIN_DF as StopWords

# ignore terms that appear in fewer than n documents (min_df can be an absolute count or a proportion)
cv = CountVectorizer(min_df=2)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# any stop words internally removed by CountVectorizer?
cv.stop_words_
{'are', 'beyond', 'body', 'bugs', 'can', 'cent', 'cents', 'do', 'for', 'good', 'healthy', 'human', 'insects', 'inside', 'like', 'money', 'new', 'no', 'oh', 'old', 'on', 'one', 'our', 'outside', 'place', 'solar', 'space', 'staying', 'system', 'that', 'there', 'things', 'two', 'you', 'your'}
count_vector.shape
(5, 8)
# use a proportion here: ignore terms that occur in less than 25% of the documents
# (with 5 documents, 0.25 * 5 = 1.25, so a term must appear in at least 2 documents -- same result as min_df=2)
cv = CountVectorizer(min_df=0.25)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 8)
cv.vocabulary_
{'all': 1, 'about': 0, 'cat': 2, 'in': 4, 'the': 7, 'hat': 3, 'learning': 5, 'library': 6}
# any stop words internally removed by CountVectorizer?
cv.stop_words_
{'are', 'beyond', 'body', 'bugs', 'can', 'cent', 'cents', 'do', 'for', 'good', 'healthy', 'human', 'insects', 'inside', 'like', 'money', 'new', 'no', 'oh', 'old', 'on', 'one', 'our', 'outside', 'place', 'solar', 'space', 'staying', 'system', 'that', 'there', 'things', 'two', 'you', 'your'}

CountVectorizer with MAX_DF as StopWords

# ignore terms that appear in more than n documents (max_df can be an absolute count or a proportion)
# use a proportion here: drop terms that occur in more than 50% of the documents
cv = CountVectorizer(max_df=0.50)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 21, 'cent': 5, 'two': 32, 'cents': 6, 'old': 19, 'new': 16, 'money': 15, 'inside': 13, 'your': 34, 'outside': 23, 'human': 11, 'body': 2, 'oh': 18, 'things': 31, 'you': 33, 'can': 4, 'do': 7, 'that': 29, 'are': 0, 'good': 9, 'for': 8, 'staying': 27, 'healthy': 10, 'on': 20, 'beyond': 1, 'bugs': 3, 'insects': 12, 'there': 30, 'no': 17, 'place': 24, 'like': 14, 'space': 26, 'our': 22, 'solar': 25, 'system': 28}
cv.stop_words_
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}
# ignore terms that appear in more than n documents
# use an absolute count here - suitable when you know the number of documents ahead of time and are dealing with only a handful
# (with 5 documents, max_df=4 drops exactly the terms that appear in all 5 titles -- the same set as max_df=0.50 above)
cv = CountVectorizer(max_df=4)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.stop_words_
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}

Custom Preprocessing

import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer = PorterStemmer()

def my_cool_preprocessor(text):
    text = text.lower()
    text = re.sub("\\W", " ", text)  # remove special chars
    text = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", text)  # normalize certain words
    # stem words
    words = re.split("\\s+", text)
    stemmed_words = [porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

cv = CountVectorizer(preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 25, 'cent': 8, 'two': 37, 'old': 23, 'new': 20, '_connector_': 0, 'about': 1, 'money': 19, 'cat': 7, 'the': 34, 'hat': 11, 'learn': 16, 'librari': 17, 'insid': 15, 'your': 39, 'outsid': 27, 'human': 13, 'bodi': 4, 'oh': 22, 'thing': 36, 'you': 38, 'can': 6, 'do': 9, 'that': 33, 'are': 2, 'good': 10, 'stay': 31, 'healthi': 12, 'on': 24, 'beyond': 3, 'bug': 5, 'insect': 14, 'there': 35, 'no': 21, 'place': 28, 'like': 18, 'space': 30, 'our': 26, 'solar': 29, 'system': 32}
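To see what the preprocessor produces before tokenization, it can be called directly on a title (an illustrative check, not in the original notebook): the text comes back lowercased, stripped of punctuation, with the listed connector words collapsed to _connector_ and the remaining words stemmed (e.g. "Learning Library" becomes "learn librari").

# inspect the preprocessed form of the first book title
print(my_cool_preprocessor(cat_in_the_hat_docs[0]))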

Working With N-Grams

# only bigrams, word level
cv = CountVectorizer(ngram_range=(2, 2), preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one cent': 35, 'cent two': 19, 'two cent': 47, 'cent old': 18, 'old cent': 33, 'cent new': 17, 'new cent': 30, 'cent _connector_': 16, '_connector_ about': 0, 'about money': 7, 'money cat': 29, 'cat _connector_': 15, '_connector_ the': 2, 'the hat': 44, 'hat learn': 22, 'learn librari': 27, 'insid your': 26, 'your outsid': 50, 'outsid _connector_': 37, 'about _connector_': 5, '_connector_ human': 1, 'human bodi': 24, 'bodi cat': 12, 'oh _connector_': 32, '_connector_ thing': 3, 'thing you': 46, 'you can': 49, 'can do': 14, 'do that': 20, 'that are': 43, 'are good': 10, 'good _connector_': 21, '_connector_ you': 4, 'you _connector_': 48, 'about stay': 9, 'stay healthi': 41, 'healthi cat': 23, 'on beyond': 34, 'beyond bug': 11, 'bug _connector_': 13, 'about insect': 6, 'insect cat': 25, 'there no': 45, 'no place': 31, 'place like': 38, 'like space': 28, 'space _connector_': 40, 'about our': 8, 'our solar': 36, 'solar system': 39, 'system cat': 42}
count_vector.shape
(5, 51)
# unigrams and bigrams
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 60, 'cent': 24, 'two': 84, 'old': 56, 'new': 50, '_connector_': 0, 'about': 6, 'money': 48, 'cat': 22, 'the': 78, 'hat': 33, 'learn': 43, 'librari': 45, 'one cent': 61, 'cent two': 28, 'two cent': 85, 'cent old': 27, 'old cent': 57, 'cent new': 26, 'new cent': 51, 'cent _connector_': 25, '_connector_ about': 1, 'about money': 9, 'money cat': 49, 'cat _connector_': 23, '_connector_ the': 3, 'the hat': 79, 'hat learn': 34, 'learn librari': 44, 'insid': 41, 'your': 89, 'outsid': 64, 'human': 37, 'bodi': 16, 'insid your': 42, 'your outsid': 90, 'outsid _connector_': 65, 'about _connector_': 7, '_connector_ human': 2, 'human bodi': 38, 'bodi cat': 17, 'oh': 54, 'thing': 82, 'you': 86, 'can': 20, 'do': 29, 'that': 76, 'are': 12, 'good': 31, 'stay': 72, 'healthi': 35, 'oh _connector_': 55, '_connector_ thing': 4, 'thing you': 83, 'you can': 88, 'can do': 21, 'do that': 30, 'that are': 77, 'are good': 13, 'good _connector_': 32, '_connector_ you': 5, 'you _connector_': 87, 'about stay': 11, 'stay healthi': 73, 'healthi cat': 36, 'on': 58, 'beyond': 14, 'bug': 18, 'insect': 39, 'on beyond': 59, 'beyond bug': 15, 'bug _connector_': 19, 'about insect': 8, 'insect cat': 40, 'there': 80, 'no': 52, 'place': 66, 'like': 46, 'space': 70, 'our': 62, 'solar': 68, 'system': 74, 'there no': 81, 'no place': 53, 'place like': 67, 'like space': 47, 'space _connector_': 71, 'about our': 10, 'our solar': 63, 'solar system': 69, 'system cat': 75}
count_vector.shape
(5, 91)

Working With Character N-Grams

# character-level bigrams, restricted to word boundaries
cv = CountVectorizer(ngram_range=(2, 2), preprocessor=my_cool_preprocessor, analyzer='char_wb')
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{' o': 11, 'on': 77, 'ne': 67, 'e ': 36, ' c': 3, 'ce': 30, 'en': 40, 'nt': 72, 't ': 96, ' t': 14, 'tw': 102, 'wo': 109, 'o ': 73, 'ol': 76, 'ld': 58, 'd ': 33, ' n': 10, 'ew': 42, 'w ': 108, ' _': 0, '_c': 17, 'co': 31, 'nn': 69, 'ec': 38, 'ct': 32, 'to': 100, 'or': 79, 'r_': 84, '_ ': 16, ' a': 1, 'ab': 18, 'bo': 26, 'ou': 80, 'ut': 107, ' m': 9, 'mo': 64, 'ey': 43, 'y ': 110, 'ca': 29, 'at': 23, 'th': 99, 'he': 48, ' h': 6, 'ha': 47, ' s': 13, 's ': 89, ' l': 8, 'le': 59, 'ea': 37, 'ar': 22, 'rn': 88, 'n ': 65, 'li': 60, 'ib': 52, 'br': 27, 'ra': 85, 'ri': 87, 'i ': 51, ' i': 7, 'in': 55, 'ns': 71, 'si': 91, 'id': 53, ' y': 15, 'yo': 111, 'ur': 106, 'r ': 83, 'ts': 101, 'hu': 50, 'um': 105, 'ma': 63, 'an': 21, ' b': 2, 'od': 74, 'di': 34, 'oh': 75, 'h ': 46, 'hi': 49, 'ng': 68, 'g ': 44, 'u ': 103, ' d': 4, 'do': 35, 're': 86, ' g': 5, 'go': 45, 'oo': 78, 'st': 94, 'ta': 97, 'ay': 24, 'al': 20, 'lt': 61, 'be': 25, 'nd': 66, 'bu': 28, 'ug': 104, 'se': 90, 'er': 41, 'no': 70, ' p': 12, 'pl': 82, 'la': 57, 'ac': 19, 'ik': 54, 'ke': 56, 'sp': 93, 'pa': 81, 'so': 92, 'sy': 95, 'ys': 112, 'te': 98, 'em': 39, 'm ': 62}
count_vector.shape
(5, 113)
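With analyzer='char_wb', character n-grams are built only from text inside word boundaries, with each word padded by a space on either side (hence entries such as ' o' and 'e '). analyzer='char' builds n-grams from the raw running text instead, which matters more for longer n-grams, since those can span several words. A comparison sketch (not in the original notebook), here with 4-grams:

# character 4-grams: 'char' spans word gaps, 'char_wb' stays inside padded words
for mode in ('char', 'char_wb'):
    cv_c = CountVectorizer(ngram_range=(4, 4), preprocessor=my_cool_preprocessor, analyzer=mode)
    print(mode, cv_c.fit_transform(cat_in_the_hat_docs).shape)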

Limiting Vocabulary Size

# unigrams and bigrams, limited to a vocabulary of the 10 most frequent terms
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor, analyzer='word', max_features=10)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 10)
cv.vocabulary_
{'_connector_': 0, 'cat': 1, 'the': 8, 'hat': 3, 'learn': 5, 'librari': 7, 'cat _connector_': 2, 'the hat': 9, 'hat learn': 4, 'learn librari': 6}

Extracting Counts of Words / N-Grams

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """return n-gram counts in descending order of counts"""

    # use only the topn items from the vector
    sorted_items = sorted_items[:topn]

    results = []
    # idx = word index, count = raw count
    for idx, count in sorted_items:
        # get the n-gram name
        n_gram = feature_names[idx]
        # collect as a list of tuples
        results.append((n_gram, count))

    return results
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor, max_features=100)
count_vector = cv.fit_transform(cat_in_the_hat_docs)

# sort the counts of the first book title in descending order of counts
sorted_items = sort_coo(count_vector[0].tocoo())

# get feature names (words/n-grams), ordered by their column position in the sparse matrix
# (get_feature_names() was renamed to get_feature_names_out() in newer scikit-learn releases)
feature_names = cv.get_feature_names()

n_grams = extract_topn_from_vector(feature_names, sorted_items, 10)
n_grams
[('cent', 4), ('_connector_', 2), ('two cent', 1), ('two', 1), ('the hat', 1), ('the', 1), ('one cent', 1), ('one', 1), ('old cent', 1), ('old', 1)]
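The same helpers work for any row of the matrix; for example, the top terms of the third title (a usage sketch, not in the original notebook):

# top 5 unigrams/bigrams for the third book title
sorted_items = sort_coo(count_vector[2].tocoo())
extract_topn_from_vector(feature_names, sorted_items, 5)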

Binary Values Instead of Counts

cv = CountVectorizer(ngram_range=(1, 2), binary=True)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
print(count_vector[2])
  (0, 35)	1
  (0, 74)	1
  (0, 4)	1
  (0, 90)	1
  (0, 29)	1
  (0, 31)	1
  (0, 9)	1
  (0, 78)	1
  (0, 27)	1
  (0, 17)	1
  (0, 91)	1
  (0, 86)	1
  (0, 82)	1
  (0, 56)	1
  (0, 34)	1
  (0, 73)	1
  (0, 28)	1
  (0, 30)	1
  (0, 8)	1
  (0, 77)	1
  (0, 26)	1
  (0, 16)	1
  (0, 89)	1
  (0, 85)	1
  (0, 55)	1
  (0, 45)	1
  (0, 33)	1
  (0, 80)	1
  (0, 39)	1
  (0, 19)	1
  (0, 7)	1
  (0, 46)	1
  (0, 44)	1
  (0, 32)	1
  (0, 79)	1
  (0, 38)	1
  (0, 18)	1
  (0, 0)	1
  (0, 6)	1
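For comparison (not in the original notebook), leaving binary at its default of False stores raw term counts in the same cells, so a word repeated within a title (such as 'cent' in the first one) gets a value greater than 1:

# default binary=False keeps raw counts instead of clamping every entry to 1
cv_counts = CountVectorizer(ngram_range=(1, 2))
counts = cv_counts.fit_transform(cat_in_the_hat_docs)
print(counts[0].max())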

Custom Tokenizer

import re

def my_tokenizer(text):
    text = re.sub("(\\W)", " \\1 ", text)
    return re.split("\\s+", text)

cv = CountVectorizer(tokenizer=my_tokenizer)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
print(cv.vocabulary_)
{'one': 34, 'cent': 14, ',': 4, 'two': 47, 'cents': 15, 'old': 32, 'new': 29, ':': 5, 'all': 7, 'about': 6, 'money': 28, '(': 2, 'cat': 13, 'in': 22, 'the': 44, 'hat': 19, "'": 1, 's': 38, 'learning': 25, 'library': 26, 'inside': 24, 'your': 49, 'outside': 36, 'human': 21, 'body': 10, ')': 3, '': 0, 'oh': 31, 'things': 46, 'you': 48, 'can': 12, 'do': 16, 'that': 43, 'are': 8, 'good': 18, 'for': 17, 'staying': 41, 'healthy': 20, 'on': 33, 'beyond': 9, 'bugs': 11, 'insects': 23, 'there': 45, 'no': 30, 'place': 37, 'like': 27, 'space': 40, 'our': 35, 'solar': 39, 'system': 42}
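Note that this tokenizer keeps punctuation, single-character tokens, and even an empty-string token (the ',', 's', and '' entries above). The default token_pattern, r"(?u)\b\w\w+\b", only matches tokens of two or more word characters, so none of those would appear without the custom tokenizer. A quick check (not in the original notebook):

# with the default tokenizer, punctuation and single characters are dropped
cv_default = CountVectorizer()
cv_default.fit(cat_in_the_hat_docs)
print("," in cv_default.vocabulary_, "s" in cv_default.vocabulary_)   # expect: False False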