Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download

Jupyter notebook nlp-of-jokes.ipynb

140 views
Kernel: Python 2

Analysing the Edinburgh Fringe Festival Jokes

This is the ipython notebook for the blog post: Python, natural language processing and predicting funny.

Here are the libraries we are going to need:

import pandas # To handle our data nicely import nltk # For all the clever stuff

Loading and tidying the data

# Load the jokes from `jokes.json` into a DataFrame and look at both ends
# of the table.
df = pandas.read_json('jokes.json')  # Loading the json file
df.head()
df.tail()

Getting rid of the common words and tokenising the jokes

# nltk.download()  # Only do this once: needed to download the `stopwords` corpus

# Upper-cased English stop words. The corpus must have been downloaded first:
# import nltk; nltk.download()
commonwords = [e.upper() for e in set(nltk.corpus.stopwords.words('english'))]
# The `\w+` tokenizer below splits on apostrophes, so these fragments
# (presumably from contractions such as "I'm" / "I've") are dropped as well.
commonwords.extend(['M', 'VE'])

# Keeps only runs of word characters, stripping punctuation out of strings.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')


def string_to_list(x):
    """
    Return the upper-cased tokens of the string `x`, in order, leaving out
    every token that appears in `commonwords`.
    """
    upper_tokens = (el.upper() for el in tokenizer.tokenize(x))
    return [el for el in upper_tokens if el not in commonwords]


df['Joke'] = df['Raw_joke'].apply(string_to_list)
df.head()

Training our classifier

From here on in we use the jokes up until 2013 as the training set.

We start by getting the entire set of words in all the jokes from the training set.

df['Year'] = df['Year'].apply(int)


def get_all_words(dataframe):
    """
    A function that gets all the words from the Joke column in a given
    dataframe.

    Returns a flat list (duplicates kept, row order preserved) of every word
    in every entry of `dataframe['Joke']`.
    """
    return [word for jk in dataframe['Joke'] for word in jk]


# Jokes up to and including 2013 form the training set.
all_words = get_all_words(df[df['Year'] <= 2013])
all_words[:10]  # The first ten words in our training data set
[u'HEARD', u'RUMOUR', u'CADBURY', u'BRINGING', u'ORIENTAL', u'CHOCOLATE', u'BAR', u'COULD', u'CHINESE', u'WISPA']

Creating a function to extract features from a given joke

def extract_features(joke, all_words):
    """
    Build the feature dictionary for one tokenised joke.

    joke: an iterable of (hashable) words.
    all_words: the collection of known words to test membership against.

    Returns a dict with one key 'contains(<word>)' per distinct word of
    `joke`; the value is True when that word is in `all_words`, else False.
    """
    # Membership in a list is linear per lookup; convert once so every word
    # of the joke is checked in constant time. Sets are used as given.
    if not isinstance(all_words, (set, frozenset)):
        all_words = set(all_words)
    return {'contains(%s)' % word: (word in all_words) for word in set(joke)}
# The training vocabulary (jokes up to and including 2013) does not depend on
# the row being processed, so build it once instead of once per joke.
training_words = set(get_all_words(df[df['Year'] <= 2013]))
df['Features'] = df['Joke'].apply(lambda x: extract_features(x, training_words))
df.head()

Labelling our jokes depending on what will be deemed as funny

# A joke is labelled funny when its rank is at most `funny_threshold`.
funny_threshold = 5
df['Rank'] = df['Rank'].apply(int)
df['Funny'] = df['Rank'] <= funny_threshold
df.head(10)

Creating a labeled feature

# Pair each feature dict with its label. `zip` is a lazy iterator on
# Python 3, so materialise it; on Python 2 this is the same list as before.
df['Labeled_Feature'] = list(zip(df['Features'], df['Funny']))
df.head()

Creating our classifier

# Train a Naive Bayes classifier on the (features, label) pairs of the jokes
# up to and including 2013, then list its ten most informative features.
classifier = nltk.NaiveBayesClassifier.train(
    df.loc[df['Year'] <= 2013, 'Labeled_Feature']
)
classifier.show_most_informative_features(10)
Most Informative Features contains(GOT) = True False : True = 2.4 : 1.0 contains(KNOW) = True True : False = 1.7 : 1.0 contains(PEOPLE) = True False : True = 1.7 : 1.0 contains(SEX) = True False : True = 1.7 : 1.0 contains(NEVER) = True False : True = 1.7 : 1.0 contains(RE) = True True : False = 1.6 : 1.0 contains(FRIEND) = True True : False = 1.6 : 1.0 contains(SAY) = True True : False = 1.6 : 1.0 contains(BOUGHT) = True True : False = 1.6 : 1.0 contains(ONE) = True True : False = 1.5 : 1.0
# Classify an ad-hoc joke: tokenise it, extract its features against the
# training vocabulary (jokes up to and including 2013) and ask the classifier.
joke = 'Why was 10 afraid of 7? Because 7 8 9'
classifier.classify(extract_features(string_to_list(joke), get_all_words(df[df['Year'] <= 2013])))
True
# Classify an ad-hoc joke: tokenise it, extract its features against the
# training vocabulary (jokes up to and including 2013) and ask the classifier.
joke = 'Your mother is ...'
classifier.classify(extract_features(string_to_list(joke), get_all_words(df[df['Year'] <= 2013])))
False

The real test comes from applying our classifier to this year's jokes

# Predict a label for every joke, then inspect the 2014 jokes, which were
# not part of the training set (training used Year <= 2013).
df['Prediction'] = df['Features'].apply(classifier.classify)
jokes_2014 = df[df['Year'] == 2014]  # filter once, reuse below
jokes_2014[['Raw_joke', 'Funny', 'Prediction']]
# Fraction of 2014 jokes whose predicted label equals the actual label.
sum(jokes_2014['Prediction'] == jokes_2014['Funny']) / float(len(jokes_2014))
0.33333333333333331

Wrapping all of the above in a function to see if we can identify how our classifier performs based on a funniness threshold

def accuracy(funny_threshold):
    """
    A function to return the accuracy of our predictor.

    Jokes with Rank <= `funny_threshold` are labelled funny. The classifier
    is trained on the jokes up to and including 2013 and the returned value
    is the fraction of 2014 jokes whose prediction matches their label.

    Note: overwrites the 'Funny', 'Labeled_Feature' and 'Prediction' columns
    of the global `df`.
    """
    df['Funny'] = df['Rank'] <= funny_threshold  # Changing the threshold
    # list(...) so the column holds tuples on Python 3 too, where zip is lazy.
    df['Labeled_Feature'] = list(zip(df['Features'], df['Funny']))  # Re create labeled features
    training = df[df['Year'] <= 2013]
    classifier = nltk.NaiveBayesClassifier.train(training['Labeled_Feature'])  # Train classifier
    df['Prediction'] = df['Features'].apply(classifier.classify)  # Apply classifier
    # Select the test rows after 'Prediction' has been written, and only once.
    test = df[df['Year'] == 2014]
    return sum(test['Prediction'] == test['Funny']) / float(len(test))
import seaborn as sns # Making our plots look nicer easier (seaborn does a lot more: check it out) import matplotlib.pyplot as plt # Plots %matplotlib inline
# Accuracy of the classifier for every funny threshold from 0 to 10.
x = list(range(0, 11))
y = [accuracy(n) for n in x]

sns.set_style("darkgrid")
sns.set_context(rc={"figure.figsize": (8, 4)})
fig = plt.figure()
plt.scatter(x, y)
plt.xlabel('Funny Threshold')
plt.ylabel('Accuracy of classifier')
# Use pyplot directly rather than through the `sns.plt` alias.
plt.ylim(0, 1.05)
plt.xlim(min(x) - .5, max(x) + .5)
plt.xticks(x)
sns.despine()
Image in a Jupyter notebook

Wrapping everything in another function to see the effect of the testing data set

We used previous years to train for this year. Here we will instead train on random samples of the data, of varying sizes.

import random
def accuracy(ratio_of_data=.8, funny_threshold=5):
    """
    A function to return the accuracy of our predictor based on the
    percentage of rows and threshold of funniness used as a training set.

    ratio_of_data: fraction of the rows of `df` sampled (at random, without
        replacement) as the training set; clamped so that at least one row
        is used for training and at least one is left for testing.
    funny_threshold: jokes with Rank <= this value are labelled funny.

    Returns the fraction of the rows left out of the training set whose
    prediction matches their label.

    Note: overwrites the 'Funny', 'Features', 'Labeled_Feature' and
    'Prediction' columns of the global `df`.
    """
    total = len(df)
    n = max(1, min(int(ratio_of_data * total), total - 1))  # Makes sure we don't have a stupid number of rows
    df['Funny'] = df['Rank'] <= funny_threshold  # Threshold funniness
    # random.sample needs a real sequence on recent Python versions.
    training_rows = random.sample(list(df.index), n)  # Identify the rows used for the training
    # `.loc` selects by index label, which is what `training_rows` holds.
    # A set makes the per-word membership test in extract_features cheap.
    all_words = set(get_all_words(df.loc[training_rows]))  # Identify all the words
    df['Features'] = df['Joke'].apply(lambda x: extract_features(x, all_words))  # Extract the features
    # list(...) so the column holds tuples on Python 3 too, where zip is lazy.
    df['Labeled_Feature'] = list(zip(df['Features'], df['Funny']))  # Re create labeled features
    classifier = nltk.NaiveBayesClassifier.train(df.loc[training_rows]['Labeled_Feature'])  # Train classifier
    df['Prediction'] = df['Features'].apply(classifier.classify)  # Apply classifier
    held_out = df.drop(training_rows)  # rows not used for training
    return sum(held_out['Prediction'] == held_out['Funny']) / float(total - n)

Here is a plot of the accuracy for varying ratio.

def ratio_experiment_data(funny_threshold=5, number_of_steps=10, number_of_repetitions=20):
    """
    Use `funny_threshold` as the funny threshold. Repeat all this for a
    granularity given by `number_of_steps` and repeating each experiment
    `number_of_repetitions` times.

    Returns a pair `(data, steps)`: `steps` is the list of training ratios
    0, 1/number_of_steps, ..., (number_of_steps - 1)/number_of_steps, and
    `data` holds one list per repetition with the accuracy measured at each
    ratio in `steps`.
    """
    steps = [x / float(number_of_steps) for x in range(number_of_steps)]
    data = [
        [accuracy(ratio_of_data=ratio, funny_threshold=funny_threshold) for ratio in steps]
        for _ in range(number_of_repetitions)
    ]
    return data, steps
def plot(funny_threshold=5, number_of_steps=10, number_of_repetitions=20, color='blue'):
    """
    Run the training-ratio experiment for `funny_threshold` and plot the
    accuracy against the ratio of data used for training.

    `number_of_steps` and `number_of_repetitions` are passed on to
    `ratio_experiment_data`; `color` is the colour of the plotted series.

    Returns the `(data, steps)` pair produced by `ratio_experiment_data`.
    """
    # The experiment helper is named `ratio_experiment_data`; the previous
    # call to `random_experiment_data` referred to a name that is not defined.
    data, steps = ratio_experiment_data(funny_threshold=funny_threshold,
                                        number_of_steps=number_of_steps,
                                        number_of_repetitions=number_of_repetitions)
    plt.figure()
    # NOTE(review): `sns.tsplot` is believed to be deprecated/removed in newer
    # seaborn releases - confirm against the installed version.
    sns.tsplot(data, steps, color=color)
    plt.xlabel('Ratio of data used for training (%s repetitions)' % number_of_repetitions)
    plt.ylabel('Accuracy')
    # Use pyplot directly rather than through the `sns.plt` alias.
    plt.ylim(0, 1)
    plt.title('Accuracy with funny threshold: %s ' % funny_threshold)
    plt.show()
    return data, steps
# Run and plot the training-ratio experiment for every funny threshold from
# 0 to 10, keeping each (data, steps) result for the combined plot.
steps = 30
repetitions = 50
stacked_data = []
clrs = sns.color_palette("hls", 11)
for n in range(11):
    # For n == 0 the index n - 1 is -1, i.e. the last palette entry; the
    # eleven thresholds still get eleven different palette entries.
    stacked_data.append(plot(funny_threshold=n,
                             number_of_steps=steps,
                             number_of_repetitions=repetitions,
                             color=clrs[n - 1]))
Image in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebook

Here are all of the above on a single plot (not terribly helpful).

# Draw the accuracy curves of all eleven funny thresholds on one figure.
sns.set_context(rc={"figure.figsize": (10, 10)})
plt.figure()
for n in range(11):
    # Each stacked_data[n] is the (data, steps) pair returned by plot(); the
    # colour index matches the one used for the individual plots.
    sns.tsplot(*stacked_data[n], color=clrs[n - 1], condition="$n=%s$" % (n))
plt.xlabel('% of data')
plt.ylabel('Accuracy')
# Use pyplot directly rather than through the `sns.plt` alias.
plt.ylim(0, 1)
plt.title('Accuracy with funny threshold: $n$')
plt.show()
Image in a Jupyter notebook