GitHub Repository: kavgan/nlp-in-practice
Path: blob/master/text-pre-processing/Text Preprocessing Examples.ipynb
Kernel: Python 3

Code tidbits for preprocessing texts

Lowercasing

texts = ["CANADA", "Canada", "canadA", "canada"]
lower_words = [word.lower() for word in texts]
lower_words
['canada', 'canada', 'canada', 'canada']
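Lowercasing pays off whenever strings that differ only in case should compare equal. A tiny sketch of that effect (the queries list is made up for illustration):

queries = ["Text Mining", "text mining", "TEXT MINING"]

# after lowercasing, all three collapse to one surface form
unique_queries = {q.lower() for q in queries}
unique_queries  # {'text mining'}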

Stemming

import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer = PorterStemmer()
# stem connect variations
words = ["connect", "connected", "connection", "connections", "connects"]
stemmed_words = [porter_stemmer.stem(word=word) for word in words]
stemdf = pd.DataFrame({'original_word': words, 'stemmed_word': stemmed_words})
stemdf
# stem trouble variations
words = ["trouble", "troubled", "troubles", "troublesome"]
stemmed_words = [porter_stemmer.stem(word=word) for word in words]
stemdf = pd.DataFrame({'original_word': words, 'stemmed_word': stemmed_words})
stemdf
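Porter is not the only stemmer NLTK ships. For comparison, here is a sketch putting it side by side with SnowballStemmer (the "english" variant, often called Porter2) on the same words; the comparison DataFrame just mirrors the cells above:

from nltk.stem import PorterStemmer, SnowballStemmer
import pandas as pd

porter = PorterStemmer()
snowball = SnowballStemmer("english")

words = ["trouble", "troubled", "troubles", "troublesome"]

# stem the same words with both algorithms for a side-by-side view
comparison = pd.DataFrame({
    'original_word': words,
    'porter': [porter.stem(w) for w in words],
    'snowball': [snowball.stem(w) for w in words],
})
comparison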

Lemmatization

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# init lemmatizer
lemmatizer = WordNetLemmatizer()
[nltk_data] Downloading package wordnet to /Users/kavgan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
# lemmatize trouble variations
words = ["trouble", "troubling", "troubled", "troubles"]
lemmatized_words = [lemmatizer.lemmatize(word=word, pos='v') for word in words]
lemmatizeddf = pd.DataFrame({'original_word': words, 'lemmatized_word': lemmatized_words})
lemmatizeddf = lemmatizeddf[['original_word', 'lemmatized_word']]
lemmatizeddf
# lemmatize goose variations
words = ["goose", "geese"]
lemmatized_words = [lemmatizer.lemmatize(word=word, pos='n') for word in words]
lemmatizeddf = pd.DataFrame({'original_word': words, 'lemmatized_word': lemmatized_words})
lemmatizeddf = lemmatizeddf[['original_word', 'lemmatized_word']]
lemmatizeddf
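Note that pos='v' above is doing real work: WordNetLemmatizer defaults to pos='n', so verb inflections come back untouched unless you pass the verb tag. A minimal sketch of the difference:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# default pos is 'n' (noun), so verb forms pass through unchanged
lemmatizer.lemmatize("troubling")           # 'troubling'
lemmatizer.lemmatize("troubling", pos='v')  # 'trouble'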

Stop Word Removal

stopwords = ['this', 'that', 'and', 'a', 'we', 'it', 'to', 'is', 'of', 'up', 'need']
text = "this is a text full of content and we need to clean it up"
words = text.split(" ")
shortlisted_words = []

# replace stop words with a "W" placeholder so removals stay visible
for w in words:
    if w not in stopwords:
        shortlisted_words.append(w)
    else:
        shortlisted_words.append("W")

print("original sentence = ", text)
print("sentence with stop words removed= ", ' '.join(shortlisted_words))
original sentence = this is a text full of content and we need to clean it up
sentence with stop words removed= W W W text full W content W W W W clean W W
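Instead of a hand-rolled list, NLTK bundles stop word lists per language (after an extra nltk.download('stopwords')). A sketch that drops the stop words outright rather than marking them:

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

text = "this is a text full of content and we need to clean it up"
kept = [w for w in text.split() if w not in stop_words]
print(' '.join(kept))  # e.g. 'text full content need clean'

One difference worth noticing: "need" survives here because it is not in NLTK's English list, while the hand-rolled list above treats it as a stop word.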

Noise Removal

import nltk
import pandas as pd
import re
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()
# stem raw words with noise
raw_words = ["..trouble..", "trouble<", "trouble!", "<a>trouble</a>", "1.trouble"]
stemmed_words = [porter_stemmer.stem(word=word) for word in raw_words]
stemdf = pd.DataFrame({'raw_word': raw_words, 'stemmed_word': stemmed_words})
stemdf
def scrub_words(text):
    """Basic cleaning of texts."""
    # remove html markup
    text = re.sub("(<.*?>)", "", text)
    # remove non-word characters and digits
    text = re.sub("(\\W|\\d)", " ", text)
    # remove surrounding whitespace
    text = text.strip()
    return text
# stem words already cleaned
cleaned_words = [scrub_words(w) for w in raw_words]
cleaned_stemmed_words = [porter_stemmer.stem(word=word) for word in cleaned_words]
stemdf = pd.DataFrame({'raw_word': raw_words, 'cleaned_word': cleaned_words, 'stemmed_word': cleaned_stemmed_words})
stemdf = stemdf[['raw_word', 'cleaned_word', 'stemmed_word']]
stemdf
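Putting the tidbits together: a minimal sketch of one possible end-to-end pass, chaining lowercasing, the scrub_words cleaner above, stop word removal, and stemming. The preprocess helper and the step order are illustrative choices, not part of the notebook:

import re
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()
stopwords = ['this', 'that', 'and', 'a', 'we', 'it', 'to', 'is', 'of', 'up', 'need']

def preprocess(text):
    """Hypothetical pipeline chaining the steps shown above."""
    text = text.lower()                                       # lowercasing
    text = scrub_words(text)                                  # noise removal
    words = [w for w in text.split() if w not in stopwords]   # stop word removal
    return [porter_stemmer.stem(w) for w in words]            # stemming

preprocess("This is <b>NOISY</b> text and we need to clean it up!")
# roughly ['noisi', 'text', 'clean']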