GitHub Repository: NakulLakhotia/Hate-Speech-Detection-in-Social-Media-using-Python
Path: blob/master/final_customization.ipynb
Kernel: Python 3

Hate Speech Detection

import pandas as panda
import re  # used by the regular expressions in preprocess() and count_tags() below
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import seaborn
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
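The stopword list, the Punkt tokenizer behind word_tokenize, and the VADER lexicon are NLTK resources that are not bundled with the library itself. In a fresh environment they can be fetched once with nltk.download; this is a setup sketch and not part of the original run:

# One-time downloads of the NLTK resources used in this notebook
import nltk
nltk.download('stopwords')      # nltk.corpus.stopwords
nltk.download('punkt')          # word_tokenize
nltk.download('vader_lexicon')  # SentimentIntensityAnalyzer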
dataset = panda.read_csv("HateSpeechData.csv")
dataset
# Adding text-length as a field in the dataset
dataset['text length'] = dataset['tweet'].apply(len)
print(dataset.head())
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2
1           1      3            0                   3        0      1
2           2      3            0                   3        0      1
3           3      3            0                   2        1      1
4           4      6            0                   6        0      1

                                               tweet  text length
0  !!! RT @mayasolovely: As a woman you shouldn't...          140
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...           85
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...          120
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...           62
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...          137
# Basic visualization of the data using histograms
# FacetGrid - multi-plot grid for plotting conditional relationships
import seaborn as sns
import matplotlib.pyplot as plt
graph = sns.FacetGrid(data=dataset, col='class')
graph.map(plt.hist, 'text length', bins=50)
<seaborn.axisgrid.FacetGrid at 0x1c0b3cc71d0>
Image in a Jupyter notebook
  • a. The distribution of text length looks broadly similar across all three classes.

  • b. The number of tweets is heavily skewed towards class 1.

# Box-plot visualization
sns.boxplot(x='class', y='text length', data=dataset)
<matplotlib.axes._subplots.AxesSubplot at 0x1c0b39eeb00>
Image in a Jupyter notebook

The box plot suggests that class-1 tweets tend to have longer text, but there are many outliers, so text length is unlikely to be a useful feature to consider.

dataset['class'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1c0b3b2d6a0>
Image in a Jupyter notebook

The histogram above shows that most tweets were labelled as offensive language (class 1) by the CF coders.
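To put numbers on that class imbalance, the label counts can be checked directly (a small sketch, not part of the original notebook; class 0 = hate speech, 1 = offensive language, 2 = neither, matching the names used in the confusion matrices later on):

# Count of tweets per class
print(dataset['class'].value_counts())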

# collecting only the tweets from the csv file into a variable named tweet
tweet = dataset.tweet

Preprocessing of the tweets

## 1. Removal of punctuation and capitalization
## 2. Tokenizing
## 3. Removal of stopwords
## 4. Stemming

stopwords = nltk.corpus.stopwords.words("english")
# extending the stopwords to include other words used on twitter such as retweet (rt) etc.
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
stemmer = PorterStemmer()

def preprocess(tweet):
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    tweet_space = tweet.str.replace(regex_pat, ' ')
    # removal of @name [mention]
    regex_pat = re.compile(r'@[\w\-]+')
    tweet_name = tweet_space.str.replace(regex_pat, '')
    # removal of links [https://abc.com]
    giant_url_regex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                                 '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    tweets = tweet_name.str.replace(giant_url_regex, '')
    # removal of punctuation and numbers
    punc_remove = tweets.str.replace("[^a-zA-Z]", " ")
    # replace runs of whitespace with a single space
    newtweet = punc_remove.str.replace(r'\s+', ' ')
    # remove leading and trailing whitespace
    newtweet = newtweet.str.replace(r'^\s+|\s+?$', '')
    # replace numbers with the token "numbr"
    newtweet = newtweet.str.replace(r'\d+(\.\d+)?', 'numbr')
    # removal of capitalization
    tweet_lower = newtweet.str.lower()
    # tokenizing
    tokenized_tweet = tweet_lower.apply(lambda x: x.split())
    # removal of stopwords
    tokenized_tweet = tokenized_tweet.apply(lambda x: [item for item in x if item not in stopwords])
    # stemming of the tweets
    tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    tweets_p = tokenized_tweet
    return tweets_p

processed_tweets = preprocess(tweet)
dataset['processed_tweets'] = processed_tweets
print(dataset[["tweet", "processed_tweets"]].head(10))
                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5  !!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just...
6  !!!!!!"@__BrighterDays: I can not just sit up ...
7  !!!!&#8220;@selfiequeenbri: cause I'm tired of...
8  " &amp; you might not get ya bitch back &amp; ...
9  " @rhythmixx_ :hobbies include: fighting Maria...

                                    processed_tweets
0  woman complain clean hous amp man alway take t...
1  boy dat cold tyga dwn bad cuffin dat hoe st place
2         dawg ever fuck bitch start cri confus shit
3                                   look like tranni
4    shit hear might true might faker bitch told ya
5     shit blow claim faith somebodi still fuck hoe
6             sit hate anoth bitch got much shit go
7           caus tire big bitch come us skinni girl
8              amp might get ya bitch back amp that
9                    hobbi includ fight mariam bitch

Visualizations

# visualizing which words are most commonly used in the twitter dataset
from wordcloud import WordCloud
# imshow - Display data as an image
# interpolation - https://matplotlib.org/3.2.1/gallery/images_contours_and_fields/interpolation_methods.html
all_words = ' '.join([text for text in dataset['processed_tweets']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)  #random=0.30
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
Image in a Jupyter notebook
# visualizing which words are most commonly used in hate speech (class 0)
hatred_words = ' '.join([text for text in dataset['processed_tweets'][dataset['class'] == 0]])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(hatred_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
Image in a Jupyter notebook
# visualizing which words are most commonly used in offensive speech (class 1)
offensive_words = ' '.join([text for text in dataset['processed_tweets'][dataset['class'] == 1]])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(offensive_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
Image in a Jupyter notebook

Feature Engineering

# TF-IDF Features - F1
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.75, min_df=5, max_features=10000)
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(dataset['processed_tweets'])
tfidf
<24783x6441 sparse matrix of type '<class 'numpy.float64'>' with 189618 stored elements in Compressed Sparse Row format>
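To sanity-check the vocabulary the vectorizer learned, the feature names can be listed (a sketch; get_feature_names is the method in the scikit-learn version this notebook appears to use, while newer releases call it get_feature_names_out):

# Peek at the unigram/bigram features kept by the vectorizer
feature_names = tfidf_vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
print(len(feature_names))   # 6441, matching the number of columns in the matrix above
print(feature_names[:20])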

Running various models using TF-IDF without additional features

# If you don't specify the random_state in the code,
# then every time you run (execute) your code a new random value is generated
# and the train and test datasets would have different values each time.
X = tfidf
y = dataset['class'].astype(int)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
model = LogisticRegression().fit(X_train_tfidf, y_train)
y_preds = model.predict(X_test_tfidf)
report = classification_report(y_test, y_preds)
print(report)
acc = accuracy_score(y_test, y_preds)
print("Logistic Regression, Accuracy Score:", acc)
              precision  recall  f1-score  support
0                  0.54    0.15      0.23      290
1                  0.91    0.97      0.93     3832
2                  0.85    0.80      0.83      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.77    0.64      0.66     4957
weighted avg       0.87    0.89      0.88     4957

Logistic Regression, Accuracy Score: 0.8910631430300585
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
y_preds = rf.predict(X_test_tfidf)
acc1 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Random Forest, Accuracy Score:", acc1)
              precision  recall  f1-score  support
0                  0.51    0.29      0.37      290
1                  0.93    0.95      0.94     3832
2                  0.83    0.89      0.86      835
micro avg          0.90    0.90      0.90     4957
macro avg          0.76    0.71      0.72     4957
weighted avg       0.89    0.90      0.89     4957

Random Forest, Accuracy Score: 0.9009481541254791
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X.toarray(), y, random_state=42, test_size=0.2)
nb = GaussianNB()
nb.fit(X_train_tfidf, y_train)
y_preds = nb.predict(X_test_tfidf)
acc2 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Naive Bayes, Accuracy Score:", acc2)
              precision  recall  f1-score  support
0                  0.10    0.39      0.16      290
1                  0.89    0.68      0.77     3832
2                  0.54    0.58      0.56      835
micro avg          0.65    0.65      0.65     4957
macro avg          0.51    0.55      0.50     4957
weighted avg       0.79    0.65      0.70     4957

Naive Bayes, Accuracy Score: 0.6491829735727255
support = LinearSVC(random_state=20)
support.fit(X_train_tfidf, y_train)
y_preds = support.predict(X_test_tfidf)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
              precision  recall  f1-score  support
0                  0.46    0.26      0.33      290
1                  0.92    0.95      0.94     3832
2                  0.83    0.85      0.84      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.74    0.69      0.70     4957
weighted avg       0.88    0.89      0.89     4957

SVM, Accuracy Score: 0.8932822271535202
objects = ('Logistic', 'RandomForest', 'Naive_bayes', 'SVM')
y_pos = np.arange(len(objects))
performance = [acc, acc1, acc2, acc3]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Algorithm Comparison for F1')
plt.show()
Image in a Jupyter notebook

Sentiment Analysis, using polarity scores as features

sentiment_analyzer = VS()

def count_tags(tweet_c):
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', tweet_c)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return (parsed_text.count('URLHERE'), parsed_text.count('MENTIONHERE'), parsed_text.count('HASHTAGHERE'))

def sentiment_analysis(tweet):
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    twitter_objs = count_tags(tweet)
    features = [sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[0], twitter_objs[1], twitter_objs[2]]
    #features = pandas.DataFrame(features)
    return features

def sentiment_analysis_array(tweets):
    features = []
    for t in tweets:
        features.append(sentiment_analysis(t))
    return np.array(features)

final_features = sentiment_analysis_array(tweet)
#final_features
new_features = panda.DataFrame({'Neg': final_features[:, 0], 'Pos': final_features[:, 1],
                                'Neu': final_features[:, 2], 'Compound': final_features[:, 3],
                                'url_tag': final_features[:, 4], 'mention_tag': final_features[:, 5],
                                'hash_tag': final_features[:, 6]})
new_features
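As a quick illustration of what sentiment_analysis returns (a sketch on a made-up tweet, not from the dataset; the exact VADER scores depend on the lexicon version installed):

# Feature order: [neg, pos, neu, compound, url_count, mention_count, hashtag_count]
example = "@someuser this is awful #rant https://example.com"
print(sentiment_analysis(example))
# prints seven values; the last three should be 1, 1, 1 for the one URL, one mention and one hashtag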
# F2 - Concatenation of tf-idf scores and sentiment scores
tfidf_a = tfidf.toarray()
modelling_features = np.concatenate([tfidf_a, final_features], axis=1)
modelling_features.shape
(24783, 6448)

Running various models using TF-IDF and additional features

# Running the model using TFIDF with some features from sentiment analysis
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
model = LogisticRegression().fit(X_train_bow, y_train)
y_preds = model.predict(X_test_bow)
report = classification_report(y_test, y_preds)
print(report)
acc = accuracy_score(y_test, y_preds)
print("Logistic Regression,Accuracy Score:", acc)
              precision  recall  f1-score  support
0                  0.56    0.15      0.24      290
1                  0.91    0.96      0.94     3832
2                  0.84    0.83      0.84      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.77    0.65      0.67     4957
weighted avg       0.88    0.89      0.88     4957

Logistic Regression,Accuracy Score: 0.8946943715957232
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)
y_preds = rf.predict(X_test_bow)
acc1 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Random Forest, Accuracy Score:", acc1)
              precision  recall  f1-score  support
0                  0.44    0.15      0.23      290
1                  0.90    0.97      0.93     3832
2                  0.86    0.76      0.81      835
micro avg          0.88    0.88      0.88     4957
macro avg          0.73    0.63      0.66     4957
weighted avg       0.87    0.88      0.87     4957

Random Forest, Accuracy Score: 0.8840024208190438
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
nb = GaussianNB()
nb.fit(X_train_bow, y_train)
y_preds = nb.predict(X_test_bow)
acc2 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Naive Bayes, Accuracy Score:", acc2)
              precision  recall  f1-score  support
0                  0.10    0.39      0.16      290
1                  0.89    0.68      0.77     3832
2                  0.54    0.59      0.56      835
micro avg          0.65    0.65      0.65     4957
macro avg          0.51    0.55      0.50     4957
weighted avg       0.79    0.65      0.70     4957

Naive Bayes, Accuracy Score: 0.650191648174299
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_bow, y_train)
y_preds = support.predict(X_test_bow)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
              precision  recall  f1-score  support
0                  0.46    0.26      0.33      290
1                  0.92    0.95      0.94     3832
2                  0.83    0.85      0.84      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.74    0.69      0.70     4957
weighted avg       0.88    0.89      0.88     4957

SVM, Accuracy Score: 0.891466612870688
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
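The warning above comes from LIBLINEAR hitting its default iteration cap on this high-dimensional feature matrix. A common way to address it (not done in the original run, shown only as a sketch) is to raise the iteration limit:

# Raising max_iter usually lets LinearSVC converge on this feature matrix
support = LinearSVC(random_state=20, max_iter=10000)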
objects = ('Logistic', 'RandomForest', 'Naive_bayes', 'SVM')
y_pos = np.arange(len(objects))
performance = [acc, acc1, acc2, acc3]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Algorithm Comparison for F2')
plt.show()
Image in a Jupyter notebook
# create doc2vec vector columns
# Initialize and train the model
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# The input for a Doc2Vec model should be a list of TaggedDocument(['list', 'of', 'word'], [TAG_001]).
# A good practice is using the indexes of sentences as the tags.
documents = [TaggedDocument(doc, [i]) for i, doc in
             enumerate(dataset["processed_tweets"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
# window    - The maximum distance between the current and predicted word within a sentence.
# min_count - Ignores all words with total frequency lower than this.
# workers   - Use this many worker threads to train the model.
# Training model - with the defaults (dm=1) this is distributed memory (PV-DM); pass dm=0 for PV-DBOW.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# infer_vector - Infer a vector for a given document after bulk training.
# Syntax: infer_vector(doc_words, alpha=None, min_alpha=None, epochs=None, steps=None)
# doc_words - A document for which the vector representation will be inferred.
# transform each document into a vector
doc2vec_df = dataset["processed_tweets"].apply(lambda x: model.infer_vector(x.split(" "))).apply(panda.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
doc2vec_df
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\gensim\models\base_any2vec.py:743: UserWarning: C extension not loaded, training will be slow. Install a C compiler and reinstall gensim for fast training. "C extension not loaded, training will be slow. "
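To see what one of these document vectors looks like (a sketch; infer_vector is stochastic, so the values vary between runs):

# Infer the 5-dimensional vector for the first processed tweet
sample_tokens = dataset["processed_tweets"].iloc[0].split(" ")
print(model.infer_vector(sample_tokens))  # array of 5 floats, since vector_size=5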
# concatenation of tf-idf scores, sentiment scores and doc2vec columns
modelling_features = np.concatenate([tfidf_a, final_features, doc2vec_df], axis=1)
modelling_features.shape
(24783, 6453)

Running the models using TF-IDF with additional features from sentiment analysis and doc2vec

X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
model = LogisticRegression().fit(X_train_bow, y_train)
y_preds = model.predict(X_test_bow)
report = classification_report(y_test, y_preds)
print(report)
acc = accuracy_score(y_test, y_preds)
print("Logistic Regression, Accuracy Score:", acc)
              precision  recall  f1-score  support
0                  0.56    0.15      0.24      290
1                  0.91    0.96      0.94     3832
2                  0.84    0.83      0.84      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.77    0.65      0.67     4957
weighted avg       0.88    0.89      0.88     4957

Logistic Regression, Accuracy Score: 0.8946943715957232
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)
y_preds = rf.predict(X_test_bow)
acc1 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Random Forest, Accuracy Score:", acc1)
              precision  recall  f1-score  support
0                  0.46    0.17      0.24      290
1                  0.89    0.96      0.93     3832
2                  0.84    0.70      0.77      835
micro avg          0.87    0.87      0.87     4957
macro avg          0.73    0.61      0.65     4957
weighted avg       0.86    0.87      0.86     4957

Random Forest, Accuracy Score: 0.8739156748033085
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
nb = GaussianNB()
nb.fit(X_train, y_train)
y_preds = nb.predict(X_test)
acc2 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Naive Bayes, Accuracy Score:", acc2)
              precision  recall  f1-score  support
0                  0.10    0.39      0.16      290
1                  0.89    0.68      0.77     3832
2                  0.54    0.59      0.56      835
micro avg          0.65    0.65      0.65     4957
macro avg          0.51    0.55      0.50     4957
weighted avg       0.79    0.65      0.70     4957

Naive Bayes, Accuracy Score: 0.650191648174299
X = panda.DataFrame(modelling_features)
y = dataset['class'].astype(int)
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_bow, y_train)
y_preds = support.predict(X_test_bow)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
              precision  recall  f1-score  support
0                  0.44    0.26      0.33      290
1                  0.92    0.95      0.94     3832
2                  0.82    0.85      0.84      835
micro avg          0.89    0.89      0.89     4957
macro avg          0.73    0.68      0.70     4957
weighted avg       0.88    0.89      0.88     4957

SVM, Accuracy Score: 0.890054468428485
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
objects = ('Logistic', 'RandomForest', 'Naive_bayes', 'SVM')
y_pos = np.arange(len(objects))
performance = [acc, acc1, acc2, acc3]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Algorithm Comparison')
plt.show()
Image in a Jupyter notebook
# Using TFIDF with sentiment scores, doc2vec and enhanced features
def additional_features(tweet):
    syllables = textstat.syllable_count(tweet)
    num_chars = sum(len(w) for w in tweet)
    num_chars_total = len(tweet)
    num_words = len(tweet.split())
    # avg_syl = total syllables / total words
    avg_syl = round(float((syllables + 0.001)) / float(num_words + 0.001), 4)
    num_unique_terms = len(set(tweet.split()))

    # Flesch-Kincaid readability tests are designed to indicate how difficult
    # a passage of English is to understand. There are two tests: the
    # Flesch Reading Ease (FRE) and the Flesch-Kincaid Grade (FKRA).
    # A text with a comparatively high FRE score should have a lower FKRA score.
    # Reference - https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    ### Modified FK grade, where avg words per sentence is just num_words / 1
    FKRA = round(float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59, 1)
    ## Modified FRE score, where the number of sentences is fixed to 1
    FRE = round(206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)), 2)

    add_features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                    num_words, num_unique_terms]
    return add_features

def get_additonal_feature_array(tweets):
    features = []
    for t in tweets:
        features.append(additional_features(t))
    return np.array(features)

fFeatures = get_additonal_feature_array(processed_tweets)
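As a quick check on what these enhanced features look like, the helper can be applied to a single processed tweet (a small sketch; the exact values depend on the textstat version installed):

# Feature order: [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_words, num_unique_terms]
print(additional_features(processed_tweets[0]))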
tfidf_a = tfidf.toarray()
modelling_features_enhanced = np.concatenate([tfidf_a, final_features, doc2vec_df, fFeatures], axis=1)
modelling_features_enhanced.shape
(24783, 6461)

Running the models using TF-IDF with sentiment scores, doc2vec and enhanced features

# Running the model using TFIDF with enhanced features
X = panda.DataFrame(modelling_features_enhanced)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
model = LogisticRegression().fit(X_train_features, y_train)
y_preds = model.predict(X_test_features)
report = classification_report(y_test, y_preds)
print(report)
acc = accuracy_score(y_test, y_preds)
print("Logistic Regression, Accuracy Score:", acc)
              precision  recall  f1-score  support
0                  0.56    0.14      0.23      279
1                  0.91    0.97      0.94     3852
2                  0.85    0.82      0.84      826
micro avg          0.90    0.90      0.90     4957
macro avg          0.78    0.64      0.67     4957
weighted avg       0.88    0.90      0.88     4957

Logistic Regression, Accuracy Score: 0.8961065160379261
# Confusion Matrix for TFIDF with additional features
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_preds)
matrix_proportions = np.zeros((3, 3))
for i in range(0, 3):
    matrix_proportions[i, :] = confusion_matrix[i, :] / float(confusion_matrix[i, :].sum())
names = ['Hate', 'Offensive', 'Neither']
confusion_df = panda.DataFrame(matrix_proportions, index=names, columns=names)
plt.figure(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12}, cmap='YlGnBu', cbar=False, square=True, fmt='.2f')
plt.ylabel(r'True Value', fontsize=14)
plt.xlabel(r'Predicted Value', fontsize=14)
plt.tick_params(labelsize=12)
Image in a Jupyter notebook
# From the confusion matrix it is clear that the model misclassifies about 78% of the hate-speech tweets as offensive.
# This explains the drop in the class-0 bar in the histogram of predicted classes below and the corresponding
# increase in the class-1 (offensive) bar.
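That 78% figure can be read directly off the row-normalised matrix computed above (a small sketch, assuming the same variable names as the confusion-matrix cell):

# Fraction of true hate-speech tweets (row 0) predicted as offensive (column 1)
print(matrix_proportions[0, 1])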
testing_index = list(X_test_features.index[0:10])
#print(testing_index)
print("Predicted Class:", y_preds[0:10])
print("Actual Class:", y_test.tolist()[0:10])
Predicted Class: [2 1 1 1 1 1 1 1 2 1]
Actual Class: [2, 1, 1, 0, 2, 1, 1, 1, 2, 2]
# Histogram presenting the count of different classes - Actual
ax = y_test.hist()
ax.set_xlabel("Class", labelpad=20, weight='bold', size=12)
ax.set_ylabel("Count", labelpad=20, weight='bold', size=12)
Text(0, 0.5, 'Count')
Image in a Jupyter notebook
# Histogram presenting the count of different classes - Predicted
ax = panda.Series(y_preds).hist()
ax.set_xlabel("Class", labelpad=20, weight='bold', size=12)
ax.set_ylabel("Count", labelpad=20, weight='bold', size=12)
Text(0, 0.5, 'Count')
Image in a Jupyter notebook
X = panda.DataFrame(modelling_features_enhanced)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train_features, y_train)
y_preds = rf.predict(X_test_features)
acc1 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Random Forest, Accuracy Score:", acc1)
              precision  recall  f1-score  support
0                  0.41    0.09      0.15      279
1                  0.88    0.97      0.92     3852
2                  0.84    0.63      0.72      826
micro avg          0.86    0.86      0.86     4957
macro avg          0.71    0.56      0.60     4957
weighted avg       0.84    0.86      0.84     4957

Random Forest, Accuracy Score: 0.8642323986282026
X = panda.DataFrame(modelling_features_enhanced)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
nb = GaussianNB()
nb.fit(X_train_features, y_train)
y_preds = nb.predict(X_test_features)
acc2 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Naive Bayes, Accuracy Score:", acc2)
              precision  recall  f1-score  support
0                  0.09    0.36      0.15      279
1                  0.90    0.69      0.78     3852
2                  0.59    0.65      0.62      826
micro avg          0.66    0.66      0.66     4957
macro avg          0.53    0.57      0.51     4957
weighted avg       0.80    0.66      0.72     4957

Naive Bayes, Accuracy Score: 0.662497478313496
# Confusion Matrix for TFIDF with additional features
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_preds)
matrix_proportions = np.zeros((3, 3))
for i in range(0, 3):
    matrix_proportions[i, :] = confusion_matrix[i, :] / float(confusion_matrix[i, :].sum())
names = ['Hate', 'Offensive', 'Neither']
confusion_df = panda.DataFrame(matrix_proportions, index=names, columns=names)
plt.figure(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12}, cmap='YlGnBu', cbar=False, square=True, fmt='.2f')
plt.ylabel(r'True Value', fontsize=14)
plt.xlabel(r'Predicted Value', fontsize=14)
plt.tick_params(labelsize=12)
Image in a Jupyter notebook
X = panda.DataFrame(modelling_features_enhanced)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_features, y_train)
y_preds = support.predict(X_test_features)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
              precision  recall  f1-score  support
0                  0.40    0.01      0.03      279
1                  0.83    0.99      0.90     3852
2                  0.92    0.36      0.51      826
micro avg          0.83    0.83      0.83     4957
macro avg          0.72    0.45      0.48     4957
weighted avg       0.82    0.83      0.79     4957

SVM, Accuracy Score: 0.8317530764575348
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
objects = ('Logistic', 'RandomForest', 'Naive_bayes', 'SVM')
y_pos = np.arange(len(objects))
performance = [acc, acc1, acc2, acc3]
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Algorithm Comparison')
plt.show()
Image in a Jupyter notebook

Combining different features

# F1, F3 and F4 combined
tfidf_a = tfidf.toarray()
modelling_features_one = np.concatenate([tfidf_a, doc2vec_df, fFeatures], axis=1)
modelling_features_one.shape
(24783, 6454)
X = panda.DataFrame(modelling_features_one)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_features, y_train)
y_preds = support.predict(X_test_features)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
              precision  recall  f1-score  support
0                  0.57    0.15      0.24      279
1                  0.89    0.97      0.93     3852
2                  0.86    0.73      0.79      826
micro avg          0.88    0.88      0.88     4957
macro avg          0.78    0.62      0.65     4957
weighted avg       0.87    0.88      0.87     4957

SVM, Accuracy Score: 0.8846076255799878
# F1, F2 and F4 combined
tfidf_a = tfidf.toarray()
modelling_features_two = np.concatenate([tfidf_a, final_features, fFeatures], axis=1)
modelling_features_two.shape
(24783, 6456)
X = panda.DataFrame(modelling_features_two)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_features, y_train)
y_preds = support.predict(X_test_features)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
              precision  recall  f1-score  support
0                  0.53    0.19      0.28      279
1                  0.93    0.94      0.93     3852
2                  0.76    0.88      0.81      826
micro avg          0.89    0.89      0.89     4957
macro avg          0.74    0.67      0.67     4957
weighted avg       0.88    0.89      0.88     4957

SVM, Accuracy Score: 0.8876336493847085
# F2 and F4 combined (sentiment scores + enhanced features; no TF-IDF, no doc2vec - the 7 + 8 = 15 columns below)
modelling_features_three = np.concatenate([final_features, fFeatures], axis=1)
modelling_features_three.shape
(24783, 15)
X = panda.DataFrame(modelling_features_three)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
support = LinearSVC(random_state=20)
support.fit(X_train_features, y_train)
y_preds = support.predict(X_test_features)
acc3 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("SVM, Accuracy Score:", acc3)
              precision  recall  f1-score  support
0                  0.14    0.01      0.02      279
1                  0.79    0.99      0.88     3852
2                  0.66    0.06      0.11      826
micro avg          0.78    0.78      0.78     4957
macro avg          0.53    0.35      0.34     4957
weighted avg       0.73    0.78      0.70     4957

SVM, Accuracy Score: 0.7805124066975994
C:\Users\NAKUL LAKHOTIA\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
# The most important features turn out to be the TF-IDF scores, which help the most in classifying hate speech.
# The doc2vec columns are not very significant for classification: removing them from the feature set makes very little difference.
# The performance of the SVM and the Random Forest drops sharply when the TF-IDF scores are excluded from the feature set.
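One way to see which TF-IDF terms drive the classification (not part of the original notebook, just a sketch that refits a logistic regression on the TF-IDF-only matrix so the coefficients line up with the vectorizer's vocabulary):

# Top TF-IDF terms per class by logistic-regression coefficient
terms = np.array(tfidf_vectorizer.get_feature_names())  # get_feature_names_out() on newer scikit-learn
clf = LogisticRegression().fit(tfidf, dataset['class'].astype(int))
for class_idx, class_name in enumerate(['Hate', 'Offensive', 'Neither']):
    top = np.argsort(clf.coef_[class_idx])[-10:]  # 10 largest coefficients for this class
    print(class_name, ":", terms[top])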
X = panda.DataFrame(modelling_features_two)
y = dataset['class'].astype(int)
X_train_features, X_test_features, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
nb = GaussianNB()
nb.fit(X_train_features, y_train)
y_preds = nb.predict(X_test_features)
acc2 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Naive Bayes, Accuracy Score:", acc2)
              precision  recall  f1-score  support
0                  0.09    0.36      0.15      279
1                  0.90    0.69      0.78     3852
2                  0.59    0.65      0.62      826
micro avg          0.66    0.66      0.66     4957
macro avg          0.53    0.57      0.51     4957
weighted avg       0.80    0.66      0.72     4957

Naive Bayes, Accuracy Score: 0.662497478313496
# The Naive Bayes classifier performs noticeably better with this feature combination,
# which actually performs poorly for Logistic Regression, especially in predicting the "hate" label.
X = panda.DataFrame(modelling_features_two)
y = dataset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
model = LogisticRegression().fit(X_train, y_train)
y_preds = model.predict(X_test)
report = classification_report(y_test, y_preds)
print(report)
acc = accuracy_score(y_test, y_preds)
print("Logistic Regression, Accuracy Score:", acc)
              precision  recall  f1-score  support
0                  0.56    0.14      0.23      279
1                  0.91    0.97      0.94     3852
2                  0.84    0.82      0.83      826
micro avg          0.89    0.89      0.89     4957
macro avg          0.77    0.64      0.67     4957
weighted avg       0.88    0.89      0.88     4957

Logistic Regression, Accuracy Score: 0.8946943715957232
X = panda.DataFrame(modelling_features_three)
y = dataset['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test)
acc1 = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
print(report)
print("Random Forest, Accuracy Score:", acc1)
              precision  recall  f1-score  support
0                  0.13    0.04      0.06      279
1                  0.82    0.92      0.86     3852
2                  0.47    0.32      0.38      826
micro avg          0.77    0.77      0.77     4957
macro avg          0.48    0.42      0.44     4957
weighted avg       0.72    0.77      0.74     4957

Random Forest, Accuracy Score: 0.7682065765584023
# Confusion Matrix for TFIDF with additional features
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_preds)
matrix_proportions = np.zeros((3, 3))
for i in range(0, 3):
    matrix_proportions[i, :] = confusion_matrix[i, :] / float(confusion_matrix[i, :].sum())
names = ['Hate', 'Offensive', 'Neither']
confusion_df = panda.DataFrame(matrix_proportions, index=names, columns=names)
plt.figure(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12}, cmap='YlGnBu', cbar=False, square=True, fmt='.2f')
plt.ylabel(r'True Value', fontsize=14)
plt.xlabel(r'Predicted Value', fontsize=14)
plt.tick_params(labelsize=12)
Image in a Jupyter notebook