GitHub Repository: kavgan/nlp-in-practice
Path: blob/master/CountVectorizer/CountVectorizer.ipynb
Kernel: Python 3

CountVectorizer Usage Examples

from sklearn.feature_extraction.text import CountVectorizer

Warm Up Example

doc=["One Cent, Two Cents, Old Cent, New Cent: All About Money"]
cv = CountVectorizer(doc) count_vector=cv.fit_transform(doc)
# show the resulting vocabulary; the numbers are not counts, they are column positions in the sparse matrix
cv.vocabulary_
{'one': 7, 'cent': 2, 'two': 8, 'cents': 3, 'old': 6, 'new': 5, 'all': 1, 'about': 0, 'money': 4}
# matrix shape: 1 document, 9 unique words
count_vector.shape
(1, 9)
# any words eliminated internally? -- nope
cv.stop_words_
set()
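The vocabulary above only records each term's column position. The actual counts live in the transformed matrix; a minimal check (not in the original notebook) is to densify the 1 x 9 row and pair each count with its term:

# sketch: map every vocabulary term to its count in the single warm-up document
counts = count_vector.toarray()[0]
for term, idx in sorted(cv.vocabulary_.items(), key=lambda kv: kv[1]):
    print(term, counts[idx])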

CountVectorizer With More Data

Plain and Simple

cat_in_the_hat_docs = [
    "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
    "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
    "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
    "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
    "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)"
]
cv = CountVectorizer()
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# show the resulting vocabulary; the numbers are not counts, they are column positions in the sparse matrix
cv.vocabulary_
{'one': 28, 'cent': 8, 'two': 40, 'cents': 9, 'old': 26, 'new': 23, 'all': 1, 'about': 0, 'money': 22, 'cat': 7, 'in': 16, 'the': 37, 'hat': 13, 'learning': 19, 'library': 20, 'inside': 18, 'your': 42, 'outside': 30, 'human': 15, 'body': 4, 'oh': 25, 'things': 39, 'you': 41, 'can': 6, 'do': 10, 'that': 36, 'are': 2, 'good': 12, 'for': 11, 'staying': 34, 'healthy': 14, 'on': 27, 'beyond': 3, 'bugs': 5, 'insects': 17, 'there': 38, 'no': 24, 'place': 31, 'like': 21, 'space': 33, 'our': 29, 'solar': 32, 'system': 35}
# shape of the count vector: 5 docs (book titles) and 43 unique words
count_vector.shape
(5, 43)
# any stop words?
cv.stop_words_
set()
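As an optional inspection step (a sketch, not part of the original notebook), the full 5 x 43 document-term matrix can be wrapped in a pandas DataFrame, one column per vocabulary term:

# rows = book titles, columns = vocabulary terms, cells = raw counts
# (use cv.get_feature_names_out() on scikit-learn 1.0+)
import pandas as pd

pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names())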

CountVectorizer With Custom StopWords

cv = CountVectorizer(stop_words=["all", "in", "the", "is", "and"])
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 40)
# any stop words that we explicitly specified?
cv.stop_words
['all', 'in', 'the', 'is', 'and']
# any stop words internally removed by CountVectorizer?
cv.stop_words_
set()
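The vocabulary only shrinks from 43 to 40 terms because just three of the five custom stop words ('all', 'in', 'the') actually occur in these titles. A quick sanity check (not in the original notebook):

# none of the custom stop words should appear in the fitted vocabulary
assert not set(["all", "in", "the", "is", "and"]) & set(cv.vocabulary_)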

CountVectorizer With Predefined StopWords

cv = CountVectorizer(stop_words="english")
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# the shape should be smaller
count_vector.shape
(5, 24)
# any stop words that we explicitly specified?
cv.stop_words
'english'
# any stop words internally removed by CountVectorizer?
cv.stop_words_
set()
# much smaller vocabulary with stop words applied
cv.vocabulary_
{'cent': 3, 'cents': 4, 'old': 17, 'new': 15, 'money': 14, 'cat': 2, 'hat': 6, 'learning': 11, 'library': 12, 'inside': 10, 'outside': 18, 'human': 8, 'body': 0, 'oh': 16, 'things': 23, 'good': 5, 'staying': 22, 'healthy': 7, 'bugs': 1, 'insects': 9, 'place': 19, 'like': 13, 'space': 21, 'solar': 20}
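The built-in list used by stop_words="english" can be inspected directly if you want to know which terms will be dropped (a quick sketch, not in the original notebook):

# scikit-learn's built-in English stop word list (a frozenset of roughly 300 words)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(len(ENGLISH_STOP_WORDS))
print(sorted(ENGLISH_STOP_WORDS)[:10])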

CountVectorizer with MIN_DF as StopWords

# ignore terms that appear in fewer than n documents (min_df can be an absolute count or a proportion)
cv = CountVectorizer(min_df=2)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
# any stop words internally removed by CountVectorizer?
cv.stop_words_
{'are', 'beyond', 'body', 'bugs', 'can', 'cent', 'cents', 'do', 'for', 'good', 'healthy', 'human', 'insects', 'inside', 'like', 'money', 'new', 'no', 'oh', 'old', 'on', 'one', 'our', 'outside', 'place', 'solar', 'space', 'staying', 'system', 'that', 'there', 'things', 'two', 'you', 'your'}
count_vector.shape
(5, 8)
# use a proportion here: ignore terms that occur in less than 25% of the documents
# (with 5 documents, 0.25 * 5 = 1.25, so a term must appear in at least 2 documents -- same result as min_df=2)
cv = CountVectorizer(min_df=0.25)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 8)
cv.vocabulary_
{'all': 1, 'about': 0, 'cat': 2, 'in': 4, 'the': 7, 'hat': 3, 'learning': 5, 'library': 6}
# any stop words internally removed by CountVectorizer?
cv.stop_words_
{'are', 'beyond', 'body', 'bugs', 'can', 'cent', 'cents', 'do', 'for', 'good', 'healthy', 'human', 'insects', 'inside', 'like', 'money', 'new', 'no', 'oh', 'old', 'on', 'one', 'our', 'outside', 'place', 'solar', 'space', 'staying', 'system', 'that', 'there', 'things', 'two', 'you', 'your'}

CountVectorizer with MAX_DF as StopWords

# ignore terms that appear in more than n documents (max_df can be an absolute count or a proportion)
# use a proportion here: drop terms that occur in more than 50% of the documents
cv = CountVectorizer(max_df=0.50)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 21, 'cent': 5, 'two': 32, 'cents': 6, 'old': 19, 'new': 16, 'money': 15, 'inside': 13, 'your': 34, 'outside': 23, 'human': 11, 'body': 2, 'oh': 18, 'things': 31, 'you': 33, 'can': 4, 'do': 7, 'that': 29, 'are': 0, 'good': 9, 'for': 8, 'staying': 27, 'healthy': 10, 'on': 20, 'beyond': 1, 'bugs': 3, 'insects': 12, 'there': 30, 'no': 17, 'place': 24, 'like': 14, 'space': 26, 'our': 22, 'solar': 25, 'system': 28}
cv.stop_words_
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}
# ignore terms that appear in more than n documents
# use an absolute count here - suitable when you know the number of documents ahead of time and are dealing with only a handful
# (with 5 documents, max_df=4 drops exactly the terms that appear in all 5 titles -- the same set as max_df=0.50 above)
cv = CountVectorizer(max_df=4)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.stop_words_
{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}

Custom Preprocessing

import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer = PorterStemmer()

def my_cool_preprocessor(text):
    text = text.lower()
    text = re.sub("\\W", " ", text)  # remove special chars
    text = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", text)  # normalize certain words
    # stem words
    words = re.split("\\s+", text)
    stemmed_words = [porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

cv = CountVectorizer(preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 25, 'cent': 8, 'two': 37, 'old': 23, 'new': 20, '_connector_': 0, 'about': 1, 'money': 19, 'cat': 7, 'the': 34, 'hat': 11, 'learn': 16, 'librari': 17, 'insid': 15, 'your': 39, 'outsid': 27, 'human': 13, 'bodi': 4, 'oh': 22, 'thing': 36, 'you': 38, 'can': 6, 'do': 9, 'that': 33, 'are': 2, 'good': 10, 'stay': 31, 'healthi': 12, 'on': 24, 'beyond': 3, 'bug': 5, 'insect': 14, 'there': 35, 'no': 21, 'place': 28, 'like': 18, 'space': 30, 'our': 26, 'solar': 29, 'system': 32}
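To see what the preprocessor produces before tokenization, it can be called directly on a title (an illustrative check, not in the original notebook): the text comes back lowercased, stripped of punctuation, with the listed connector words collapsed to _connector_ and the remaining words stemmed (e.g. "Learning Library" becomes "learn librari").

# inspect the preprocessed form of the first book title
print(my_cool_preprocessor(cat_in_the_hat_docs[0]))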

Working With N-Grams

# only bigrams, word level
cv = CountVectorizer(ngram_range=(2, 2), preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one cent': 35, 'cent two': 19, 'two cent': 47, 'cent old': 18, 'old cent': 33, 'cent new': 17, 'new cent': 30, 'cent _connector_': 16, '_connector_ about': 0, 'about money': 7, 'money cat': 29, 'cat _connector_': 15, '_connector_ the': 2, 'the hat': 44, 'hat learn': 22, 'learn librari': 27, 'insid your': 26, 'your outsid': 50, 'outsid _connector_': 37, 'about _connector_': 5, '_connector_ human': 1, 'human bodi': 24, 'bodi cat': 12, 'oh _connector_': 32, '_connector_ thing': 3, 'thing you': 46, 'you can': 49, 'can do': 14, 'do that': 20, 'that are': 43, 'are good': 10, 'good _connector_': 21, '_connector_ you': 4, 'you _connector_': 48, 'about stay': 9, 'stay healthi': 41, 'healthi cat': 23, 'on beyond': 34, 'beyond bug': 11, 'bug _connector_': 13, 'about insect': 6, 'insect cat': 25, 'there no': 45, 'no place': 31, 'place like': 38, 'like space': 28, 'space _connector_': 40, 'about our': 8, 'our solar': 36, 'solar system': 39, 'system cat': 42}
count_vector.shape
(5, 51)
# unigrams and bigrams
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{'one': 60, 'cent': 24, 'two': 84, 'old': 56, 'new': 50, '_connector_': 0, 'about': 6, 'money': 48, 'cat': 22, 'the': 78, 'hat': 33, 'learn': 43, 'librari': 45, 'one cent': 61, 'cent two': 28, 'two cent': 85, 'cent old': 27, 'old cent': 57, 'cent new': 26, 'new cent': 51, 'cent _connector_': 25, '_connector_ about': 1, 'about money': 9, 'money cat': 49, 'cat _connector_': 23, '_connector_ the': 3, 'the hat': 79, 'hat learn': 34, 'learn librari': 44, 'insid': 41, 'your': 89, 'outsid': 64, 'human': 37, 'bodi': 16, 'insid your': 42, 'your outsid': 90, 'outsid _connector_': 65, 'about _connector_': 7, '_connector_ human': 2, 'human bodi': 38, 'bodi cat': 17, 'oh': 54, 'thing': 82, 'you': 86, 'can': 20, 'do': 29, 'that': 76, 'are': 12, 'good': 31, 'stay': 72, 'healthi': 35, 'oh _connector_': 55, '_connector_ thing': 4, 'thing you': 83, 'you can': 88, 'can do': 21, 'do that': 30, 'that are': 77, 'are good': 13, 'good _connector_': 32, '_connector_ you': 5, 'you _connector_': 87, 'about stay': 11, 'stay healthi': 73, 'healthi cat': 36, 'on': 58, 'beyond': 14, 'bug': 18, 'insect': 39, 'on beyond': 59, 'beyond bug': 15, 'bug _connector_': 19, 'about insect': 8, 'insect cat': 40, 'there': 80, 'no': 52, 'place': 66, 'like': 46, 'space': 70, 'our': 62, 'solar': 68, 'system': 74, 'there no': 81, 'no place': 53, 'place like': 67, 'like space': 47, 'space _connector_': 71, 'about our': 10, 'our solar': 63, 'solar system': 69, 'system cat': 75}
count_vector.shape
(5, 91)

Working With Character N-Grams

# character-level bigrams, restricted to word boundaries
cv = CountVectorizer(ngram_range=(2, 2), preprocessor=my_cool_preprocessor, analyzer='char_wb')
count_vector = cv.fit_transform(cat_in_the_hat_docs)
cv.vocabulary_
{' o': 11, 'on': 77, 'ne': 67, 'e ': 36, ' c': 3, 'ce': 30, 'en': 40, 'nt': 72, 't ': 96, ' t': 14, 'tw': 102, 'wo': 109, 'o ': 73, 'ol': 76, 'ld': 58, 'd ': 33, ' n': 10, 'ew': 42, 'w ': 108, ' _': 0, '_c': 17, 'co': 31, 'nn': 69, 'ec': 38, 'ct': 32, 'to': 100, 'or': 79, 'r_': 84, '_ ': 16, ' a': 1, 'ab': 18, 'bo': 26, 'ou': 80, 'ut': 107, ' m': 9, 'mo': 64, 'ey': 43, 'y ': 110, 'ca': 29, 'at': 23, 'th': 99, 'he': 48, ' h': 6, 'ha': 47, ' s': 13, 's ': 89, ' l': 8, 'le': 59, 'ea': 37, 'ar': 22, 'rn': 88, 'n ': 65, 'li': 60, 'ib': 52, 'br': 27, 'ra': 85, 'ri': 87, 'i ': 51, ' i': 7, 'in': 55, 'ns': 71, 'si': 91, 'id': 53, ' y': 15, 'yo': 111, 'ur': 106, 'r ': 83, 'ts': 101, 'hu': 50, 'um': 105, 'ma': 63, 'an': 21, ' b': 2, 'od': 74, 'di': 34, 'oh': 75, 'h ': 46, 'hi': 49, 'ng': 68, 'g ': 44, 'u ': 103, ' d': 4, 'do': 35, 're': 86, ' g': 5, 'go': 45, 'oo': 78, 'st': 94, 'ta': 97, 'ay': 24, 'al': 20, 'lt': 61, 'be': 25, 'nd': 66, 'bu': 28, 'ug': 104, 'se': 90, 'er': 41, 'no': 70, ' p': 12, 'pl': 82, 'la': 57, 'ac': 19, 'ik': 54, 'ke': 56, 'sp': 93, 'pa': 81, 'so': 92, 'sy': 95, 'ys': 112, 'te': 98, 'em': 39, 'm ': 62}
count_vector.shape
(5, 113)
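With analyzer='char_wb', character n-grams are built only from text inside word boundaries, with each word padded by a space on either side (hence entries such as ' o' and 'e '). analyzer='char' builds n-grams from the raw running text instead, which matters more for longer n-grams, since those can span several words. A comparison sketch (not in the original notebook), here with 4-grams:

# character 4-grams: 'char' spans word gaps, 'char_wb' stays inside padded words
for mode in ('char', 'char_wb'):
    cv_c = CountVectorizer(ngram_range=(4, 4), preprocessor=my_cool_preprocessor, analyzer=mode)
    print(mode, cv_c.fit_transform(cat_in_the_hat_docs).shape)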

Limiting Vocabulary Size

# unigrams and bigrams, limited to a vocabulary of the 10 most frequent terms
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor, analyzer='word', max_features=10)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape
(5, 10)
cv.vocabulary_
{'_connector_': 0, 'cat': 1, 'the': 8, 'hat': 3, 'learn': 5, 'librari': 7, 'cat _connector_': 2, 'the hat': 9, 'hat learn': 4, 'learn librari': 6}

Extracting Counts of Words / N-Grams

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """return n-gram counts in descending order of counts"""

    # use only the topn items from the vector
    sorted_items = sorted_items[:topn]

    results = []
    # idx = word index, count = raw count
    for idx, count in sorted_items:
        # get the n-gram name
        n_gram = feature_names[idx]
        # collect as a list of tuples
        results.append((n_gram, count))

    return results
cv = CountVectorizer(ngram_range=(1, 2), preprocessor=my_cool_preprocessor, max_features=100)
count_vector = cv.fit_transform(cat_in_the_hat_docs)

# sort the counts of the first book title in descending order of counts
sorted_items = sort_coo(count_vector[0].tocoo())

# get feature names (words/n-grams), ordered by their column position in the sparse matrix
# (get_feature_names() was renamed to get_feature_names_out() in newer scikit-learn releases)
feature_names = cv.get_feature_names()

n_grams = extract_topn_from_vector(feature_names, sorted_items, 10)
n_grams
[('cent', 4), ('_connector_', 2), ('two cent', 1), ('two', 1), ('the hat', 1), ('the', 1), ('one cent', 1), ('one', 1), ('old cent', 1), ('old', 1)]
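The same helpers work for any row of the matrix; for example, the top terms of the third title (a usage sketch, not in the original notebook):

# top 5 unigrams/bigrams for the third book title
sorted_items = sort_coo(count_vector[2].tocoo())
extract_topn_from_vector(feature_names, sorted_items, 5)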

Binary Values Instead of Counts

cv = CountVectorizer(ngram_range=(1, 2), binary=True)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
print(count_vector[2])
  (0, 35)	1
  (0, 74)	1
  (0, 4)	1
  (0, 90)	1
  (0, 29)	1
  (0, 31)	1
  (0, 9)	1
  (0, 78)	1
  (0, 27)	1
  (0, 17)	1
  (0, 91)	1
  (0, 86)	1
  (0, 82)	1
  (0, 56)	1
  (0, 34)	1
  (0, 73)	1
  (0, 28)	1
  (0, 30)	1
  (0, 8)	1
  (0, 77)	1
  (0, 26)	1
  (0, 16)	1
  (0, 89)	1
  (0, 85)	1
  (0, 55)	1
  (0, 45)	1
  (0, 33)	1
  (0, 80)	1
  (0, 39)	1
  (0, 19)	1
  (0, 7)	1
  (0, 46)	1
  (0, 44)	1
  (0, 32)	1
  (0, 79)	1
  (0, 38)	1
  (0, 18)	1
  (0, 0)	1
  (0, 6)	1
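For comparison (not in the original notebook), leaving binary at its default of False stores raw term counts in the same cells, so a word repeated within a title (such as 'cent' in the first one) gets a value greater than 1:

# default binary=False keeps raw counts instead of clamping every entry to 1
cv_counts = CountVectorizer(ngram_range=(1, 2))
counts = cv_counts.fit_transform(cat_in_the_hat_docs)
print(counts[0].max())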

Custom Tokenizer

import re

def my_tokenizer(text):
    text = re.sub("(\\W)", " \\1 ", text)
    return re.split("\\s+", text)

cv = CountVectorizer(tokenizer=my_tokenizer)
count_vector = cv.fit_transform(cat_in_the_hat_docs)
print(cv.vocabulary_)
{'one': 34, 'cent': 14, ',': 4, 'two': 47, 'cents': 15, 'old': 32, 'new': 29, ':': 5, 'all': 7, 'about': 6, 'money': 28, '(': 2, 'cat': 13, 'in': 22, 'the': 44, 'hat': 19, "'": 1, 's': 38, 'learning': 25, 'library': 26, 'inside': 24, 'your': 49, 'outside': 36, 'human': 21, 'body': 10, ')': 3, '': 0, 'oh': 31, 'things': 46, 'you': 48, 'can': 12, 'do': 16, 'that': 43, 'are': 8, 'good': 18, 'for': 17, 'staying': 41, 'healthy': 20, 'on': 33, 'beyond': 9, 'bugs': 11, 'insects': 23, 'there': 45, 'no': 30, 'place': 37, 'like': 27, 'space': 40, 'our': 35, 'solar': 39, 'system': 42}
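Note that this tokenizer keeps punctuation, single-character tokens, and even an empty-string token (the ',', 's', and '' entries above). The default token_pattern, r"(?u)\b\w\w+\b", only matches tokens of two or more word characters, so none of those would appear without the custom tokenizer. A quick check (not in the original notebook):

# with the default tokenizer, punctuation and single characters are dropped
cv_default = CountVectorizer()
cv_default.fit(cat_in_the_hat_docs)
print("," in cv_default.vocabulary_, "s" in cv_default.vocabulary_)   # expect: False False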