Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
rasbt
GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch08/logistic-regression-bag-of-words/log-reg.ipynb
1945 views
Kernel: Python 3 (ipykernel)

Logistic Regression Classifier for Text

Obtaining the IMDb movie review dataset

import os
import sys
import tarfile
import time
import urllib.request

# Source archive for the IMDb movie review dataset (Maas et al., 2011).
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'

# Remove a stale (possibly partial) archive before re-downloading.
if os.path.exists(target):
    os.remove(target)


def reporthook(count, block_size, total_size):
    """Progress callback for urllib.request.urlretrieve.

    Prints percent downloaded, megabytes transferred, average speed, and
    elapsed time on a single, continuously rewritten line.

    Parameters:
        count: number of blocks transferred so far (0 on the first call).
        block_size: size of each block in bytes.
        total_size: total size of the file in bytes (as reported by the server).
    """
    global start_time
    if count == 0:
        # First call: just record the start time, nothing to report yet.
        start_time = time.time()
        return
    # Guard against a zero elapsed time on a very fast first tick,
    # which would otherwise raise ZeroDivisionError below.
    duration = max(time.time() - start_time, 1e-9)
    progress_size = int(count * block_size)
    speed = progress_size / (1024.**2 * duration)
    # The final block is usually partial, so count * block_size can exceed
    # total_size; clamp the displayed percentage at 100.
    percent = min(count * block_size * 100. / total_size, 100.)
    sys.stdout.write(f'\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB '
                     f'| {speed:.2f} MB/s | {duration:.2f} sec elapsed')
    sys.stdout.flush()


# Download only if neither the extracted folder nor the archive is present.
if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):
    urllib.request.urlretrieve(source, target, reporthook)
100% | 80.23 MB | 6.03 MB/s | 13.30 sec elapsed
# Unpack the archive into ./aclImdb unless it was already extracted.
# NOTE(review): tar.extractall() trusts member paths; this archive comes from
# a known academic source, but consider extractall(filter='data') on
# Python 3.12+ for path-traversal protection.
if not os.path.isdir('aclImdb'):
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()

Preprocessing the movie dataset into more convenient format

Install pyprind by uncommenting the next code cell.

# !pip install pyprind
Collecting pyprind Using cached PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB) WARNING: Error parsing requirements for soupsieve: [Errno 2] No such file or directory: '/Users/sebastian/miniforge3/lib/python3.10/site-packages/soupsieve-2.3.2.post1.dist-info/METADATA' DEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063 Installing collected packages: pyprind Successfully installed pyprind-2.11.3
import pyprind
import pandas as pd
import os
import sys
from packaging import version

# change the `basepath` to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'

# Map the folder names onto integer class labels.
labels = {'pos': 1, 'neg': 0}

# if the progress bar does not show, change stream=sys.stdout to stream=2
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

df = pd.DataFrame()
# Walk all four subdirectories (test/train x pos/neg); sorted() makes the
# row order deterministic across file systems.
# NOTE: concatenating one row at a time is O(n^2); kept for fidelity with
# the book, but collecting rows in a list and building one DataFrame at the
# end would be much faster.
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            if version.parse(pd.__version__) >= version.parse("1.3.2"):
                x = pd.DataFrame([[txt, labels[l]]],
                                 columns=['review', 'sentiment'])
                # ignore_index=True yields a clean 0..49999 RangeIndex; the
                # original ignore_index=False left every row labeled 0,
                # inconsistent with the append branch below.
                df = pd.concat([df, x], ignore_index=True)
            else:
                # DataFrame.append was removed in pandas 2.0; this branch
                # only runs on older pandas versions.
                df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']
0% [##############################] 100% | ETA: 00:00:00 Total time elapsed: 00:00:20

Shuffling the DataFrame:

import numpy as np

# Shuffle the rows (positives/negatives and test/train reviews were appended
# in contiguous runs) with a fixed seed so the split below is reproducible.
if version.parse(pd.__version__) < version.parse("1.3.2"):
    # Legacy pandas: permute the index directly.
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))
else:
    # Modern pandas: sample all rows and restore a clean RangeIndex.
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)

Optional: Saving the assembled data as CSV file:

# Persist the assembled DataFrame so the expensive assembly step above can be
# skipped on subsequent runs.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
import pandas as pd

# Reload from disk; this also makes the notebook resumable from this cell.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# the following is necessary on some computers:
df = df.rename(columns={"0": "review", "1": "sentiment"})
df.head(3)
# Sanity check: expect (50000, 2) — 50k reviews, two columns.
df.shape
(50000, 2)


Training a logistic regression model for document classification

Split the shuffled DataFrame into 25,000 training and 25,000 test reviews:

# Split the 50,000 shuffled reviews into two disjoint 25,000-row halves.
# Positional .iloc slicing is used because label-based .loc slicing is
# *end-inclusive*: the original `df.loc[:25000]` / `df.loc[25000:]` put row
# 25000 into BOTH the training and the test set (25,001 + 25,000 rows),
# leaking one review across the split.
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def tokenizer(text):
    """Split *text* on whitespace into raw tokens."""
    return text.split()


def tokenizer_porter(text):
    """Split *text* on whitespace and reduce each token to its Porter stem."""
    return [porter.stem(token) for token in tokenizer(text)]
import nltk

# Fetch the NLTK stop-word corpus (one-time download, cached under ~/nltk_data).
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] /Users/sebastian/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
True
from nltk.corpus import stopwords

# English stop words as a list; the same `stop` object is reused verbatim in
# the grid-search parameter grid below, so it must remain a list.
stop = stopwords.words('english')

# Demonstrate stop-word removal on Porter-stemmed tokens.
[token
 for token in tokenizer_porter('a runner likes running and runs a lot')
 if token not in stop]
['runner', 'like', 'run', 'run', 'lot']
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Tf-idf vectorizer; lowercasing and preprocessing are disabled because the
# tokenizers defined above operate on the raw review text.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two sub-grids: the first searches tf-idf features with both tokenizers;
# the second disables idf weighting and normalization (use_idf=False,
# norm=None) to test raw term frequencies, with and without stop-word removal.
small_param_grid = [{'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [None],
                     'vect__tokenizer': [tokenizer, tokenizer_porter],
                     'clf__penalty': ['l2'],
                     'clf__C': [1.0, 10.0]},
                    {'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [stop, None],
                     'vect__tokenizer': [tokenizer],
                     'vect__use_idf': [False],
                     'vect__norm': [None],
                     'clf__penalty': ['l2'],
                     'clf__C': [1.0, 10.0]},
                    ]

# Vectorizer + classifier in one pipeline so the grid search tunes both
# stages jointly; liblinear handles l2-penalized problems of this size well.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear'))])

# 5-fold cross-validated search over the 8 candidate settings above;
# n_jobs=-1 uses all cores (see the Windows pickling note below).
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

Important Note about n_jobs

Please note that it is highly recommended to use n_jobs=-1 (instead of n_jobs=1) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the n_jobs=-1 setting related to pickling the tokenizer and tokenizer_porter functions for multiprocessing on Windows. Another workaround would be to replace those two functions, [tokenizer, tokenizer_porter], with [str.split]. However, note that the replacement by the simple str.split would not support stemming.

# Run the grid search: 5 folds x 8 candidates = 40 model fits.
gs_lr_tfidf.fit(X_train, y_train)
Fitting 5 folds for each of 8 candidates, totalling 40 fits
# Report the best hyper-parameter combination and its cross-validated accuracy.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')
Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x168eab370>} CV Accuracy: 0.887
# Evaluate the refit best estimator on the held-out test set.
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')
Test Accuracy: 0.893