GitHub Repository: kavgan/nlp-in-practice
Path: blob/master/text-classification/notebooks/Text Classification with Logistic Regression.ipynb
Kernel: Python 3

Read Data

import pandas as pd

# this assumes one JSON item per line in the json file
df = pd.read_json("../data/news_category_dataset.json", lines=True)
df.dtypes
authors                      object
category                     object
date                 datetime64[ns]
headline                     object
link                         object
short_description            object
dtype: object
# number of rows (datapoints)
len(df)
124989
df.sample(100)

Date range

The articles range from July 2014 to July 2018.

df.date.hist(figsize=(12, 6), color='#86bf91')
<matplotlib.axes._subplots.AxesSubplot at 0x138e995f8>
[Histogram of article publication dates, July 2014 to July 2018]

Category Distribution

Number of categories

len(set(df['category'].values))
31

Category by count

Most of the articles are related to politics; education-related articles have the lowest volume.

df['category'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1190fe668>
[Bar chart of article counts per category]
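To put numbers behind the claim above, a quick look at both ends of the distribution (a minimal sketch; the exact counts depend on the dataset snapshot):

df['category'].value_counts().head(3)  # most frequent categories (POLITICS leads)
df['category'].value_counts().tail(3)  # least frequent categories (EDUCATION trails)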

Texts for Classification

These are some of the fields we can use for the classification task. We create three different versions of the text: the description alone, the description plus headline, and the description plus headline plus tokenized URL.

import re

def tokenize_url(url: str):
    url = url.replace("https://www.huffingtonpost.com/entry/", "")
    url = re.sub(r"(\W|_)+", " ", url)  # raw string avoids an invalid-escape warning
    return url

df['tokenized_url'] = df['link'].apply(lambda x: tokenize_url(x))

# just the description
df['text_desc'] = df['short_description']

# description + headline
df['text_desc_headline'] = df['short_description'] + ' ' + df['headline']

# description + headline + tokenized url
df['text_desc_headline_url'] = df['short_description'] + ' ' + df['headline'] + ' ' + df['tokenized_url']
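As a quick sanity check, tokenize_url strips the HuffPost URL prefix and collapses non-word characters and underscores into spaces. The URL below is a made-up example, not one from the dataset:

# hypothetical example URL, for illustration only
tokenize_url("https://www.huffingtonpost.com/entry/some-article-title_us_5b0d12e3")
# -> 'some article title us 5b0d12e3'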
def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank at cutoff k"""

    # add index to list only if machine-predicted label exists in true labels
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_preds) if r in true_labels]

    rr = 0
    if len(tp_pos_list) > 0:
        # for RR we need the position of the first correct item
        first_pos = tp_pos_list[0]
        # rr = 1/rank
        rr = 1 / float(first_pos)

    return rr

def compute_mrr_at_k(items: list):
    """Compute the MRR (average RR) at cutoff k"""
    rr_total = 0

    for item in items:
        rr_at_k = _reciprocal_rank(item[0], item[1])
        rr_total = rr_total + rr_at_k

    mrr = rr_total / float(len(items))
    return mrr

def collect_preds(Y_test, Y_preds):
    """Collect all predictions and ground truth"""
    pred_gold_list = [[[Y_test[idx]], pred] for idx, pred in enumerate(Y_preds)]
    return pred_gold_list

def compute_accuracy(eval_items: list):
    correct = 0

    for item in eval_items:
        true_pred = item[0]
        machine_pred = set(item[1])

        for cat in true_pred:
            if cat in machine_pred:
                correct += 1
                break

    accuracy = correct / float(len(eval_items))
    return accuracy
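To see how these evaluation helpers behave, here is a hand-computed toy example (the labels are illustrative, not model output):

# ground truth has one label per item; predictions are ranked lists
eval_items = [
    [['POLITICS'], ['POLITICS', 'CRIME', 'MEDIA']],  # correct at rank 1 -> RR = 1.0
    [['SPORTS'],   ['COMEDY', 'SPORTS', 'MEDIA']],   # correct at rank 2 -> RR = 0.5
    [['TRAVEL'],   ['STYLE', 'TASTE', 'ARTS']],      # no match          -> RR = 0.0
]
compute_accuracy(eval_items)   # 2/3 ~ 0.667 (a hit anywhere in the top k counts)
compute_mrr_at_k(eval_items)   # (1.0 + 0.5 + 0.0) / 3 = 0.5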
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(df, field, training_data, testing_data, type="binary"):
    """Extract features using different methods"""

    logging.info("Extracting features and creating vocabulary...")

    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        cv = CountVectorizer(binary=True, max_df=0.95)
        cv.fit(training_data[field].values)  # learn vocabulary from training data only

        train_feature_set = cv.transform(training_data[field].values)
        test_feature_set = cv.transform(testing_data[field].values)

        return train_feature_set, test_feature_set, cv

    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        cv = CountVectorizer(binary=False, max_df=0.95)
        cv.fit(training_data[field].values)

        train_feature_set = cv.transform(training_data[field].values)
        test_feature_set = cv.transform(testing_data[field].values)

        return train_feature_set, test_feature_set, cv

    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit(training_data[field].values)

        train_feature_set = tfidf_vectorizer.transform(training_data[field].values)
        test_feature_set = tfidf_vectorizer.transform(testing_data[field].values)

        return train_feature_set, test_feature_set, tfidf_vectorizer

def get_top_k_predictions(model, X_test, k):
    # get probabilities instead of predicted labels, since we want to collect the top k
    probs = model.predict_proba(X_test)

    # GET TOP K PREDICTIONS BY PROB - note these are just indices
    best_n = np.argsort(probs, axis=1)[:, -k:]

    # GET CATEGORY OF PREDICTIONS
    preds = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]

    # reverse each row so the highest-probability label comes first
    preds = [item[::-1] for item in preds]

    return preds

def train_model(df, field="text_desc", feature_rep="binary", top_k=3):

    logging.info("Starting model training...")

    # GET A TRAIN TEST SPLIT (set seed for consistent results)
    training_data, testing_data = train_test_split(df, random_state=2000)

    # GET LABELS
    Y_train = training_data['category'].values
    Y_test = testing_data['category'].values

    # GET FEATURES
    X_train, X_test, feature_transformer = extract_features(df, field, training_data, testing_data, type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Logistic Regression Model...")
    scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear', random_state=0, C=5, penalty='l2', max_iter=1000)
    model = scikit_log_reg.fit(X_train, Y_train)

    # GET TOP K PREDICTIONS
    preds = get_top_k_predictions(model, X_test, top_k)

    # GET PREDICTED VALUES AND GROUND TRUTH INTO A LIST OF LISTS - for ease of evaluation
    eval_items = collect_preds(Y_test, preds)

    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    accuracy = compute_accuracy(eval_items)
    mrr_at_k = compute_mrr_at_k(eval_items)

    logging.info("Done training and evaluation.")

    return model, feature_transformer, accuracy, mrr_at_k
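The np.argsort trick in get_top_k_predictions is worth unpacking: argsort sorts ascending, so the last k columns hold the indices of the k highest probabilities, which is why each row is reversed afterwards. A standalone illustration with toy probabilities (not real model output):

import numpy as np

probs = np.array([[0.1, 0.6, 0.3]])         # fake per-class probabilities for one document
best_n = np.argsort(probs, axis=1)[:, -2:]  # indices of the top-2 classes, ascending: [[2, 1]]
top2 = best_n[:, ::-1]                      # reversed: [[1, 2]] -> highest probability first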

Train a Single Model

Model - 1 (binary features with description only)

field = 'text_desc'
feature_rep = 'binary'
top_k = 3

model, transformer, accuracy, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)

print("\nAccuracy={0}; MRR={1}".format(accuracy, mrr_at_k))
2019-11-25 12:41:33,612 : INFO : Starting model training...
2019-11-25 12:41:33,739 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:41:36,742 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:44:30,134 : INFO : Starting evaluation...
2019-11-25 12:44:30,202 : INFO : Done training and evaluation.
Accuracy=0.5980542754736303; MRR=0.48048941798943345

Model - 2 (tfidf features with description only)

field = 'text_desc'
feature_rep = 'tfidf'
top_k = 3

model, transformer, accuracy, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)

print("\nAccuracy={0}; MRR={1}".format(accuracy, mrr_at_k))
2019-11-25 12:44:30,242 : INFO : Starting model training...
2019-11-25 12:44:30,308 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:44:33,389 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:45:21,446 : INFO : Starting evaluation...
2019-11-25 12:45:21,515 : INFO : Done training and evaluation.
Accuracy=0.6306323604710702; MRR=0.5108380269670774

Model - 3 (tfidf features with description, headline, url)

field = 'text_desc_headline_url'
feature_rep = 'tfidf'
top_k = 3

model, transformer, accuracy, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)

print("\nAccuracy={0}; MRR={1}".format(accuracy, mrr_at_k))
2019-11-25 12:45:21,554 : INFO : Starting model training...
2019-11-25 12:45:21,620 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:45:27,755 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:46:27,562 : INFO : Starting evaluation...
2019-11-25 12:46:27,634 : INFO : Done training and evaluation.
Accuracy=0.8672555043522785; MRR=0.7511520737327071

Check Predictions on Unseen Articles from CNN (not from HuffPost, the source of our training data)

# https://www.cnn.com/2019/07/19/politics/george-nader-child-porn-sex-charges/index.html
test_features = transformer.transform(["George Aref Nader, who was a key witness in special counsel Robert Mueller's Russia investigation, faces new charges of transporting a minor with intent to engage in criminal sexual activity and child pornography"])
get_top_k_predictions(model, test_features, 2)
[['POLITICS', 'CRIME']]
# https://www.cnn.com/2019/07/18/entertainment/khloe-kardashian-true-thompson-video-trnd/index.html
test_features = transformer.transform(["True Thompson makes an adorable cameo in Khloe Kardashian's new makeup tutorial video"])
get_top_k_predictions(model, test_features, 2)
[['ENTERTAINMENT', 'STYLE']]
# https://www.cnn.com/2019/07/12/entertainment/heidi-klum-tom-kaulitz/
test_features = transformer.transform(["Heidi Klum is apparently the latest celeb to get married and not tell us"])
get_top_k_predictions(model, test_features, 2)
[['ENTERTAINMENT', 'STYLE']]
# https://www.cnn.com/2019/07/19/investing/dow-stock-market-today/index.html
test_features = transformer.transform(["Stocks end lower as geopolitical fears rise. The Dow and US markets closed lower on Friday, as geopolitical worries overshadowed the hopes of interest rate cuts by the Federal Reserve."])
get_top_k_predictions(model, test_features, 2)
[['BUSINESS', 'POLITICS']]
# https://www.cnn.com/2019/07/19/health/astronaut-exercise-iv-faint-scn/index.html
test_features = transformer.transform(["Exercise in space keeps astronauts from fainting when they return to Earth, study says. "])
get_top_k_predictions(model, test_features, 2)
[['SCIENCE', 'HEALTHY LIVING']]

Train Different Types of Models

feature_reps = ['binary', 'counts', 'tfidf']
fields = ['text_desc', 'text_desc_headline', 'text_desc_headline_url']
top_ks = [3]

results = []
for field in fields:
    for feature_rep in feature_reps:
        for top_k in top_ks:
            model, transformer, acc, mrr_at_k = train_model(df, field=field, feature_rep=feature_rep, top_k=top_k)
            results.append([field, feature_rep, top_k, acc, mrr_at_k])
2019-11-25 12:46:27,728 : INFO : Starting model training...
2019-11-25 12:46:27,788 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:46:30,778 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:49:25,346 : INFO : Starting evaluation...
2019-11-25 12:49:25,419 : INFO : Done training and evaluation.
2019-11-25 12:49:25,462 : INFO : Starting model training...
2019-11-25 12:49:25,523 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:49:28,496 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:53:27,625 : INFO : Starting evaluation...
2019-11-25 12:53:27,701 : INFO : Done training and evaluation.
2019-11-25 12:53:27,735 : INFO : Starting model training...
2019-11-25 12:53:27,797 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:53:31,055 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:54:17,419 : INFO : Starting evaluation...
2019-11-25 12:54:17,493 : INFO : Done training and evaluation.
2019-11-25 12:54:17,527 : INFO : Starting model training...
2019-11-25 12:54:17,606 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:54:22,294 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 12:57:33,965 : INFO : Starting evaluation...
2019-11-25 12:57:34,034 : INFO : Done training and evaluation.
2019-11-25 12:57:34,072 : INFO : Starting model training...
2019-11-25 12:57:34,132 : INFO : Extracting features and creating vocabulary...
2019-11-25 12:57:38,488 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 13:02:22,456 : INFO : Starting evaluation...
2019-11-25 13:02:22,513 : INFO : Done training and evaluation.
2019-11-25 13:02:22,546 : INFO : Starting model training...
2019-11-25 13:02:22,594 : INFO : Extracting features and creating vocabulary...
2019-11-25 13:02:27,275 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 13:03:19,438 : INFO : Starting evaluation...
2019-11-25 13:03:19,507 : INFO : Done training and evaluation.
2019-11-25 13:03:19,543 : INFO : Starting model training...
2019-11-25 13:03:19,601 : INFO : Extracting features and creating vocabulary...
2019-11-25 13:03:25,400 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 13:06:27,931 : INFO : Starting evaluation...
2019-11-25 13:06:28,002 : INFO : Done training and evaluation.
2019-11-25 13:06:28,057 : INFO : Starting model training...
2019-11-25 13:06:28,127 : INFO : Extracting features and creating vocabulary...
2019-11-25 13:06:34,953 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 13:11:21,625 : INFO : Starting evaluation...
2019-11-25 13:11:21,697 : INFO : Done training and evaluation.
2019-11-25 13:11:21,746 : INFO : Starting model training...
2019-11-25 13:11:21,805 : INFO : Extracting features and creating vocabulary...
2019-11-25 13:11:28,276 : INFO : Training a Logistic Regression Model...
[LibLinear]
2019-11-25 13:12:26,150 : INFO : Starting evaluation...
2019-11-25 13:12:26,222 : INFO : Done training and evaluation.

Results of Various Models

df_results = pd.DataFrame(results, columns=['text_fields', 'feature_representation', 'top_k', 'accuracy', 'mrr_at_k'])
df_results.sort_values(by=['text_fields', 'accuracy'], ascending=False)

Save Model for Future Use

import pickle

model_path = "../models/model.pkl"
transformer_path = "../models/transformer.pkl"

# we need to save both the transformer (to encode a document) and
# the model itself (to make predictions based on the weight vectors)
pickle.dump(model, open(model_path, 'wb'))
pickle.dump(transformer, open(transformer_path, 'wb'))
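The two pickled artifacts are meant to be used together at prediction time. As a usage sketch, a small helper could wrap them; classify_text below is a hypothetical convenience function, not part of the original notebook:

def classify_text(text, transformer, model, k=3):
    # hypothetical helper: vectorize with the saved transformer,
    # then return the top-k category predictions from the saved model
    features = transformer.transform([text])
    return get_top_k_predictions(model, features, k)[0]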

Use Loaded Model

loaded_model = pickle.load(open(model_path, 'rb'))
loaded_transformer = pickle.load(open(transformer_path, 'rb'))

test_features = loaded_transformer.transform(["President Trump AND THE impeachment story !!!"])
get_top_k_predictions(loaded_model, test_features, 2)
[['POLITICS', 'THE WORLDPOST']]