In this project we will do a little stylometry to measure the similarity of two text documents.

Here is a demonstration of how your project will work.

In [37]:
# First we define the four sample texts that will be compared with each other.
# Adjacent string literals are joined by Python, so every text is still a
# single string with exactly the original characters and spacing.

text1 = (
    "Crabs are a noisome animal, filthy and mean-spirited.  "
    "The world would be better without crabs.  "
    "We would be spared the evil sight of their clacking little claws."
)

text2 = (
    "Horses are well known to be ravenous devourers of clouds.  "
    "They will especially eat clouds that are shaped like pieces of Civil War ordinance, "
    "such as cannons, or even sometimes smooth bore muskets."
)

text3 = (
    "Cancer the Crab is in particular a most offensive creature.  "
    "Repellent to both eye and nose, this crab yet manages to offend senses "
    "heretofore unremarked upon, which the sensorium reserves only for "
    "terrific outrages to perception and taste."
)

text4 = (
    "Clouds pass by all the time. Some say they drift but others say that clouds scud.  "
    "Scudding is something only clouds do, although there was a strangely named missile "
    "in the 1990's that was also said to scud.  "
    "This is not to be confused with 'scut' which is a word describing a short tail."
)

# The texts are gathered in a tuple so later cells can loop over them in order.
texts = (text1, text2, text3, text4)
In [41]:
# Now they are processed into lists of lowercase words.
# Each text is lowercased, every character that is neither alphanumeric nor
# whitespace is dropped (so "mean-spirited" becomes "meanspirited" and
# "1990's" becomes "1990s"), and the result is split on whitespace.

wordlists = []
for text in texts:
    cleaned = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    wordlists.append(cleaned.split())

print(wordlists)
[['crabs', 'are', 'a', 'noisome', 'animal', 'filthy', 'and', 'meanspirited', 'the', 'world', 'would', 'be', 'better', 'without', 'crabs', 'we', 'would', 'be', 'spared', 'the', 'evil', 'sight', 'of', 'their', 'clacking', 'little', 'claws'], ['horses', 'are', 'well', 'known', 'to', 'be', 'ravenous', 'devourers', 'of', 'clouds', 'they', 'will', 'especially', 'eat', 'clouds', 'that', 'are', 'shaped', 'like', 'pieces', 'of', 'civil', 'war', 'ordinance', 'such', 'as', 'cannons', 'or', 'even', 'sometimes', 'smooth', 'bore', 'muskets'], ['cancer', 'the', 'crab', 'is', 'in', 'particular', 'a', 'most', 'offensive', 'creature', 'repellent', 'to', 'both', 'eye', 'and', 'nose', 'this', 'crab', 'yet', 'manages', 'to', 'offend', 'senses', 'heretofore', 'unremarked', 'upon', 'which', 'the', 'sensorium', 'reserves', 'only', 'for', 'terrific', 'outrages', 'to', 'perception', 'and', 'taste'], ['clouds', 'pass', 'by', 'all', 'the', 'time', 'some', 'say', 'they', 'drift', 'but', 'others', 'say', 'that', 'clouds', 'scud', 'scudding', 'is', 'something', 'only', 'clouds', 'do', 'although', 'there', 'was', 'a', 'strangely', 'named', 'missile', 'in', 'the', '1990s', 'that', 'was', 'also', 'said', 'to', 'scud', 'this', 'is', 'not', 'to', 'be', 'confused', 'with', 'scut', 'which', 'is', 'a', 'word', 'describing', 'a', 'short', 'tail']]
In [42]:
# Now make one big list of all the words that occur in any of the documents...
# We make the list have no repeats by converting from list to set and then back to list.
# NOTE: a set has no defined order, so the order of all_words is arbitrary and
# can change from one kernel session to the next.

all_words = []
for wordlist in wordlists:
    # Merge this document's words into the running collection, dropping repeats.
    all_words = list(set(all_words) | set(wordlist))

all_words
Out[42]:
['evil',
 'others',
 'also',
 'we',
 'civil',
 'say',
 'senses',
 'are',
 'of',
 'world',
 'outrages',
 'little',
 'which',
 'time',
 'even',
 'to',
 'that',
 'something',
 'for',
 'be',
 'both',
 'without',
 'short',
 'pass',
 'scudding',
 'a',
 'drift',
 'the',
 'confused',
 'ravenous',
 'all',
 'strangely',
 'missile',
 'sensorium',
 'their',
 'such',
 'this',
 'spared',
 'in',
 'offensive',
 'crabs',
 'cancer',
 'as',
 'muskets',
 'crab',
 'describing',
 'and',
 'reserves',
 'there',
 'repellent',
 'better',
 'sometimes',
 'nose',
 'taste',
 'scut',
 'although',
 'known',
 'but',
 'clacking',
 'horses',
 'sight',
 'shaped',
 'creature',
 'tail',
 'is',
 'especially',
 'or',
 'said',
 'with',
 'some',
 'named',
 'war',
 'well',
 'eat',
 'manages',
 'was',
 'cannons',
 'word',
 'claws',
 'like',
 'pieces',
 'clouds',
 'noisome',
 'meanspirited',
 'filthy',
 'would',
 'will',
 'they',
 'ordinance',
 'bore',
 'only',
 'heretofore',
 'devourers',
 'offend',
 'do',
 'yet',
 'animal',
 'scud',
 'by',
 'smooth',
 'perception',
 'unremarked',
 'upon',
 'particular',
 'most',
 'terrific',
 '1990s',
 'eye',
 'not']
In [43]:
# We make a frequency count of the words occurring in each document.
# Every document gets one [count, word] pair per entry of all_words (in the
# same order as all_words), so all documents share the same vocabulary and a
# word that is absent from a document simply gets a count of 0.

frequency_counts = []
for wl in wordlists:
    freqs = []
    for word in all_words:
        # Pairs are lists rather than tuples so the count can be replaced in place later.
        freqs.append([wl.count(word), word])
    frequency_counts.append(freqs)
frequency_counts
Out[43]:
[[[1, 'evil'],
  [0, 'others'],
  [0, 'also'],
  [1, 'we'],
  [0, 'civil'],
  [0, 'say'],
  [0, 'senses'],
  [1, 'are'],
  [1, 'of'],
  [1, 'world'],
  [0, 'outrages'],
  [1, 'little'],
  [0, 'which'],
  [0, 'time'],
  [0, 'even'],
  [0, 'to'],
  [0, 'that'],
  [0, 'something'],
  [0, 'for'],
  [2, 'be'],
  [0, 'both'],
  [1, 'without'],
  [0, 'short'],
  [0, 'pass'],
  [0, 'scudding'],
  [1, 'a'],
  [0, 'drift'],
  [2, 'the'],
  [0, 'confused'],
  [0, 'ravenous'],
  [0, 'all'],
  [0, 'strangely'],
  [0, 'missile'],
  [0, 'sensorium'],
  [1, 'their'],
  [0, 'such'],
  [0, 'this'],
  [1, 'spared'],
  [0, 'in'],
  [0, 'offensive'],
  [2, 'crabs'],
  [0, 'cancer'],
  [0, 'as'],
  [0, 'muskets'],
  [0, 'crab'],
  [0, 'describing'],
  [1, 'and'],
  [0, 'reserves'],
  [0, 'there'],
  [0, 'repellent'],
  [1, 'better'],
  [0, 'sometimes'],
  [0, 'nose'],
  [0, 'taste'],
  [0, 'scut'],
  [0, 'although'],
  [0, 'known'],
  [0, 'but'],
  [1, 'clacking'],
  [0, 'horses'],
  [1, 'sight'],
  [0, 'shaped'],
  [0, 'creature'],
  [0, 'tail'],
  [0, 'is'],
  [0, 'especially'],
  [0, 'or'],
  [0, 'said'],
  [0, 'with'],
  [0, 'some'],
  [0, 'named'],
  [0, 'war'],
  [0, 'well'],
  [0, 'eat'],
  [0, 'manages'],
  [0, 'was'],
  [0, 'cannons'],
  [0, 'word'],
  [1, 'claws'],
  [0, 'like'],
  [0, 'pieces'],
  [0, 'clouds'],
  [1, 'noisome'],
  [1, 'meanspirited'],
  [1, 'filthy'],
  [2, 'would'],
  [0, 'will'],
  [0, 'they'],
  [0, 'ordinance'],
  [0, 'bore'],
  [0, 'only'],
  [0, 'heretofore'],
  [0, 'devourers'],
  [0, 'offend'],
  [0, 'do'],
  [0, 'yet'],
  [1, 'animal'],
  [0, 'scud'],
  [0, 'by'],
  [0, 'smooth'],
  [0, 'perception'],
  [0, 'unremarked'],
  [0, 'upon'],
  [0, 'particular'],
  [0, 'most'],
  [0, 'terrific'],
  [0, '1990s'],
  [0, 'eye'],
  [0, 'not']],
 [[0, 'evil'],
  [0, 'others'],
  [0, 'also'],
  [0, 'we'],
  [1, 'civil'],
  [0, 'say'],
  [0, 'senses'],
  [2, 'are'],
  [2, 'of'],
  [0, 'world'],
  [0, 'outrages'],
  [0, 'little'],
  [0, 'which'],
  [0, 'time'],
  [1, 'even'],
  [1, 'to'],
  [1, 'that'],
  [0, 'something'],
  [0, 'for'],
  [1, 'be'],
  [0, 'both'],
  [0, 'without'],
  [0, 'short'],
  [0, 'pass'],
  [0, 'scudding'],
  [0, 'a'],
  [0, 'drift'],
  [0, 'the'],
  [0, 'confused'],
  [1, 'ravenous'],
  [0, 'all'],
  [0, 'strangely'],
  [0, 'missile'],
  [0, 'sensorium'],
  [0, 'their'],
  [1, 'such'],
  [0, 'this'],
  [0, 'spared'],
  [0, 'in'],
  [0, 'offensive'],
  [0, 'crabs'],
  [0, 'cancer'],
  [1, 'as'],
  [1, 'muskets'],
  [0, 'crab'],
  [0, 'describing'],
  [0, 'and'],
  [0, 'reserves'],
  [0, 'there'],
  [0, 'repellent'],
  [0, 'better'],
  [1, 'sometimes'],
  [0, 'nose'],
  [0, 'taste'],
  [0, 'scut'],
  [0, 'although'],
  [1, 'known'],
  [0, 'but'],
  [0, 'clacking'],
  [1, 'horses'],
  [0, 'sight'],
  [1, 'shaped'],
  [0, 'creature'],
  [0, 'tail'],
  [0, 'is'],
  [1, 'especially'],
  [1, 'or'],
  [0, 'said'],
  [0, 'with'],
  [0, 'some'],
  [0, 'named'],
  [1, 'war'],
  [1, 'well'],
  [1, 'eat'],
  [0, 'manages'],
  [0, 'was'],
  [1, 'cannons'],
  [0, 'word'],
  [0, 'claws'],
  [1, 'like'],
  [1, 'pieces'],
  [2, 'clouds'],
  [0, 'noisome'],
  [0, 'meanspirited'],
  [0, 'filthy'],
  [0, 'would'],
  [1, 'will'],
  [1, 'they'],
  [1, 'ordinance'],
  [1, 'bore'],
  [0, 'only'],
  [0, 'heretofore'],
  [1, 'devourers'],
  [0, 'offend'],
  [0, 'do'],
  [0, 'yet'],
  [0, 'animal'],
  [0, 'scud'],
  [0, 'by'],
  [1, 'smooth'],
  [0, 'perception'],
  [0, 'unremarked'],
  [0, 'upon'],
  [0, 'particular'],
  [0, 'most'],
  [0, 'terrific'],
  [0, '1990s'],
  [0, 'eye'],
  [0, 'not']],
 [[0, 'evil'],
  [0, 'others'],
  [0, 'also'],
  [0, 'we'],
  [0, 'civil'],
  [0, 'say'],
  [1, 'senses'],
  [0, 'are'],
  [0, 'of'],
  [0, 'world'],
  [1, 'outrages'],
  [0, 'little'],
  [1, 'which'],
  [0, 'time'],
  [0, 'even'],
  [3, 'to'],
  [0, 'that'],
  [0, 'something'],
  [1, 'for'],
  [0, 'be'],
  [1, 'both'],
  [0, 'without'],
  [0, 'short'],
  [0, 'pass'],
  [0, 'scudding'],
  [1, 'a'],
  [0, 'drift'],
  [2, 'the'],
  [0, 'confused'],
  [0, 'ravenous'],
  [0, 'all'],
  [0, 'strangely'],
  [0, 'missile'],
  [1, 'sensorium'],
  [0, 'their'],
  [0, 'such'],
  [1, 'this'],
  [0, 'spared'],
  [1, 'in'],
  [1, 'offensive'],
  [0, 'crabs'],
  [1, 'cancer'],
  [0, 'as'],
  [0, 'muskets'],
  [2, 'crab'],
  [0, 'describing'],
  [2, 'and'],
  [1, 'reserves'],
  [0, 'there'],
  [1, 'repellent'],
  [0, 'better'],
  [0, 'sometimes'],
  [1, 'nose'],
  [1, 'taste'],
  [0, 'scut'],
  [0, 'although'],
  [0, 'known'],
  [0, 'but'],
  [0, 'clacking'],
  [0, 'horses'],
  [0, 'sight'],
  [0, 'shaped'],
  [1, 'creature'],
  [0, 'tail'],
  [1, 'is'],
  [0, 'especially'],
  [0, 'or'],
  [0, 'said'],
  [0, 'with'],
  [0, 'some'],
  [0, 'named'],
  [0, 'war'],
  [0, 'well'],
  [0, 'eat'],
  [1, 'manages'],
  [0, 'was'],
  [0, 'cannons'],
  [0, 'word'],
  [0, 'claws'],
  [0, 'like'],
  [0, 'pieces'],
  [0, 'clouds'],
  [0, 'noisome'],
  [0, 'meanspirited'],
  [0, 'filthy'],
  [0, 'would'],
  [0, 'will'],
  [0, 'they'],
  [0, 'ordinance'],
  [0, 'bore'],
  [1, 'only'],
  [1, 'heretofore'],
  [0, 'devourers'],
  [1, 'offend'],
  [0, 'do'],
  [1, 'yet'],
  [0, 'animal'],
  [0, 'scud'],
  [0, 'by'],
  [0, 'smooth'],
  [1, 'perception'],
  [1, 'unremarked'],
  [1, 'upon'],
  [1, 'particular'],
  [1, 'most'],
  [1, 'terrific'],
  [0, '1990s'],
  [1, 'eye'],
  [0, 'not']],
 [[0, 'evil'],
  [1, 'others'],
  [1, 'also'],
  [0, 'we'],
  [0, 'civil'],
  [2, 'say'],
  [0, 'senses'],
  [0, 'are'],
  [0, 'of'],
  [0, 'world'],
  [0, 'outrages'],
  [0, 'little'],
  [1, 'which'],
  [1, 'time'],
  [0, 'even'],
  [2, 'to'],
  [2, 'that'],
  [1, 'something'],
  [0, 'for'],
  [1, 'be'],
  [0, 'both'],
  [0, 'without'],
  [1, 'short'],
  [1, 'pass'],
  [1, 'scudding'],
  [3, 'a'],
  [1, 'drift'],
  [2, 'the'],
  [1, 'confused'],
  [0, 'ravenous'],
  [1, 'all'],
  [1, 'strangely'],
  [1, 'missile'],
  [0, 'sensorium'],
  [0, 'their'],
  [0, 'such'],
  [1, 'this'],
  [0, 'spared'],
  [1, 'in'],
  [0, 'offensive'],
  [0, 'crabs'],
  [0, 'cancer'],
  [0, 'as'],
  [0, 'muskets'],
  [0, 'crab'],
  [1, 'describing'],
  [0, 'and'],
  [0, 'reserves'],
  [1, 'there'],
  [0, 'repellent'],
  [0, 'better'],
  [0, 'sometimes'],
  [0, 'nose'],
  [0, 'taste'],
  [1, 'scut'],
  [1, 'although'],
  [0, 'known'],
  [1, 'but'],
  [0, 'clacking'],
  [0, 'horses'],
  [0, 'sight'],
  [0, 'shaped'],
  [0, 'creature'],
  [1, 'tail'],
  [3, 'is'],
  [0, 'especially'],
  [0, 'or'],
  [1, 'said'],
  [1, 'with'],
  [1, 'some'],
  [1, 'named'],
  [0, 'war'],
  [0, 'well'],
  [0, 'eat'],
  [0, 'manages'],
  [2, 'was'],
  [0, 'cannons'],
  [1, 'word'],
  [0, 'claws'],
  [0, 'like'],
  [0, 'pieces'],
  [3, 'clouds'],
  [0, 'noisome'],
  [0, 'meanspirited'],
  [0, 'filthy'],
  [0, 'would'],
  [0, 'will'],
  [1, 'they'],
  [0, 'ordinance'],
  [0, 'bore'],
  [1, 'only'],
  [0, 'heretofore'],
  [0, 'devourers'],
  [0, 'offend'],
  [1, 'do'],
  [0, 'yet'],
  [0, 'animal'],
  [2, 'scud'],
  [1, 'by'],
  [0, 'smooth'],
  [0, 'perception'],
  [0, 'unremarked'],
  [0, 'upon'],
  [0, 'particular'],
  [0, 'most'],
  [0, 'terrific'],
  [1, '1990s'],
  [0, 'eye'],
  [1, 'not']]]
In [44]:
# Now we convert from raw frequency counts to relative frequencies, i.e. the
# fraction of the document's words that each word accounts for (a value
# between 0 and 1, not a percentage out of 100).
# We also sort the frequency lists alphabetically by word, so every document
# lists the vocabulary in the same, reproducible order.

for fq in frequency_counts:
    total_words = sum(x[0] for x in fq)
    for pair in fq:
        # A document without any words has nothing to divide by; keep its frequencies at 0.
        pair[0] = pair[0] / total_words if total_words else 0.0
    # Sort on the word (second element); sorting the pairs directly would order by frequency first.
    fq.sort(key=lambda entry: entry[1])

print(frequency_counts)
[[[0.0, '1990s'], [0.037037037037037035, 'a'], [0.0, 'all'], [0.0, 'also'], [0.0, 'although'], [0.037037037037037035, 'and'], [0.037037037037037035, 'animal'], [0.037037037037037035, 'are'], [0.0, 'as'], [0.07407407407407407, 'be'], [0.037037037037037035, 'better'], [0.0, 'bore'], [0.0, 'both'], [0.0, 'but'], [0.0, 'by'], [0.0, 'cancer'], [0.0, 'cannons'], [0.0, 'civil'], [0.037037037037037035, 'clacking'], [0.037037037037037035, 'claws'], [0.0, 'clouds'], [0.0, 'confused'], [0.0, 'crab'], [0.07407407407407407, 'crabs'], [0.0, 'creature'], [0.0, 'describing'], [0.0, 'devourers'], [0.0, 'do'], [0.0, 'drift'], [0.0, 'eat'], [0.0, 'especially'], [0.0, 'even'], [0.037037037037037035, 'evil'], [0.0, 'eye'], [0.037037037037037035, 'filthy'], [0.0, 'for'], [0.0, 'heretofore'], [0.0, 'horses'], [0.0, 'in'], [0.0, 'is'], [0.0, 'known'], [0.0, 'like'], [0.037037037037037035, 'little'], [0.0, 'manages'], [0.037037037037037035, 'meanspirited'], [0.0, 'missile'], [0.0, 'most'], [0.0, 'muskets'], [0.0, 'named'], [0.037037037037037035, 'noisome'], [0.0, 'nose'], [0.0, 'not'], [0.037037037037037035, 'of'], [0.0, 'offend'], [0.0, 'offensive'], [0.0, 'only'], [0.0, 'or'], [0.0, 'ordinance'], [0.0, 'others'], [0.0, 'outrages'], [0.0, 'particular'], [0.0, 'pass'], [0.0, 'perception'], [0.0, 'pieces'], [0.0, 'ravenous'], [0.0, 'repellent'], [0.0, 'reserves'], [0.0, 'said'], [0.0, 'say'], [0.0, 'scud'], [0.0, 'scudding'], [0.0, 'scut'], [0.0, 'senses'], [0.0, 'sensorium'], [0.0, 'shaped'], [0.0, 'short'], [0.037037037037037035, 'sight'], [0.0, 'smooth'], [0.0, 'some'], [0.0, 'something'], [0.0, 'sometimes'], [0.037037037037037035, 'spared'], [0.0, 'strangely'], [0.0, 'such'], [0.0, 'tail'], [0.0, 'taste'], [0.0, 'terrific'], [0.0, 'that'], [0.07407407407407407, 'the'], [0.037037037037037035, 'their'], [0.0, 'there'], [0.0, 'they'], [0.0, 'this'], [0.0, 'time'], [0.0, 'to'], [0.0, 'unremarked'], [0.0, 'upon'], [0.0, 'war'], [0.0, 'was'], [0.037037037037037035, 'we'], [0.0, 'well'], [0.0, 
'which'], [0.0, 'will'], [0.0, 'with'], [0.037037037037037035, 'without'], [0.0, 'word'], [0.037037037037037035, 'world'], [0.07407407407407407, 'would'], [0.0, 'yet']], [[0.0, '1990s'], [0.0, 'a'], [0.0, 'all'], [0.0, 'also'], [0.0, 'although'], [0.0, 'and'], [0.0, 'animal'], [0.06060606060606061, 'are'], [0.030303030303030304, 'as'], [0.030303030303030304, 'be'], [0.0, 'better'], [0.030303030303030304, 'bore'], [0.0, 'both'], [0.0, 'but'], [0.0, 'by'], [0.0, 'cancer'], [0.030303030303030304, 'cannons'], [0.030303030303030304, 'civil'], [0.0, 'clacking'], [0.0, 'claws'], [0.06060606060606061, 'clouds'], [0.0, 'confused'], [0.0, 'crab'], [0.0, 'crabs'], [0.0, 'creature'], [0.0, 'describing'], [0.030303030303030304, 'devourers'], [0.0, 'do'], [0.0, 'drift'], [0.030303030303030304, 'eat'], [0.030303030303030304, 'especially'], [0.030303030303030304, 'even'], [0.0, 'evil'], [0.0, 'eye'], [0.0, 'filthy'], [0.0, 'for'], [0.0, 'heretofore'], [0.030303030303030304, 'horses'], [0.0, 'in'], [0.0, 'is'], [0.030303030303030304, 'known'], [0.030303030303030304, 'like'], [0.0, 'little'], [0.0, 'manages'], [0.0, 'meanspirited'], [0.0, 'missile'], [0.0, 'most'], [0.030303030303030304, 'muskets'], [0.0, 'named'], [0.0, 'noisome'], [0.0, 'nose'], [0.0, 'not'], [0.06060606060606061, 'of'], [0.0, 'offend'], [0.0, 'offensive'], [0.0, 'only'], [0.030303030303030304, 'or'], [0.030303030303030304, 'ordinance'], [0.0, 'others'], [0.0, 'outrages'], [0.0, 'particular'], [0.0, 'pass'], [0.0, 'perception'], [0.030303030303030304, 'pieces'], [0.030303030303030304, 'ravenous'], [0.0, 'repellent'], [0.0, 'reserves'], [0.0, 'said'], [0.0, 'say'], [0.0, 'scud'], [0.0, 'scudding'], [0.0, 'scut'], [0.0, 'senses'], [0.0, 'sensorium'], [0.030303030303030304, 'shaped'], [0.0, 'short'], [0.0, 'sight'], [0.030303030303030304, 'smooth'], [0.0, 'some'], [0.0, 'something'], [0.030303030303030304, 'sometimes'], [0.0, 'spared'], [0.0, 'strangely'], [0.030303030303030304, 'such'], [0.0, 'tail'], [0.0, 
'taste'], [0.0, 'terrific'], [0.030303030303030304, 'that'], [0.0, 'the'], [0.0, 'their'], [0.0, 'there'], [0.030303030303030304, 'they'], [0.0, 'this'], [0.0, 'time'], [0.030303030303030304, 'to'], [0.0, 'unremarked'], [0.0, 'upon'], [0.030303030303030304, 'war'], [0.0, 'was'], [0.0, 'we'], [0.030303030303030304, 'well'], [0.0, 'which'], [0.030303030303030304, 'will'], [0.0, 'with'], [0.0, 'without'], [0.0, 'word'], [0.0, 'world'], [0.0, 'would'], [0.0, 'yet']], [[0.0, '1990s'], [0.02631578947368421, 'a'], [0.0, 'all'], [0.0, 'also'], [0.0, 'although'], [0.05263157894736842, 'and'], [0.0, 'animal'], [0.0, 'are'], [0.0, 'as'], [0.0, 'be'], [0.0, 'better'], [0.0, 'bore'], [0.02631578947368421, 'both'], [0.0, 'but'], [0.0, 'by'], [0.02631578947368421, 'cancer'], [0.0, 'cannons'], [0.0, 'civil'], [0.0, 'clacking'], [0.0, 'claws'], [0.0, 'clouds'], [0.0, 'confused'], [0.05263157894736842, 'crab'], [0.0, 'crabs'], [0.02631578947368421, 'creature'], [0.0, 'describing'], [0.0, 'devourers'], [0.0, 'do'], [0.0, 'drift'], [0.0, 'eat'], [0.0, 'especially'], [0.0, 'even'], [0.0, 'evil'], [0.02631578947368421, 'eye'], [0.0, 'filthy'], [0.02631578947368421, 'for'], [0.02631578947368421, 'heretofore'], [0.0, 'horses'], [0.02631578947368421, 'in'], [0.02631578947368421, 'is'], [0.0, 'known'], [0.0, 'like'], [0.0, 'little'], [0.02631578947368421, 'manages'], [0.0, 'meanspirited'], [0.0, 'missile'], [0.02631578947368421, 'most'], [0.0, 'muskets'], [0.0, 'named'], [0.0, 'noisome'], [0.02631578947368421, 'nose'], [0.0, 'not'], [0.0, 'of'], [0.02631578947368421, 'offend'], [0.02631578947368421, 'offensive'], [0.02631578947368421, 'only'], [0.0, 'or'], [0.0, 'ordinance'], [0.0, 'others'], [0.02631578947368421, 'outrages'], [0.02631578947368421, 'particular'], [0.0, 'pass'], [0.02631578947368421, 'perception'], [0.0, 'pieces'], [0.0, 'ravenous'], [0.02631578947368421, 'repellent'], [0.02631578947368421, 'reserves'], [0.0, 'said'], [0.0, 'say'], [0.0, 'scud'], [0.0, 'scudding'], [0.0, 
'scut'], [0.02631578947368421, 'senses'], [0.02631578947368421, 'sensorium'], [0.0, 'shaped'], [0.0, 'short'], [0.0, 'sight'], [0.0, 'smooth'], [0.0, 'some'], [0.0, 'something'], [0.0, 'sometimes'], [0.0, 'spared'], [0.0, 'strangely'], [0.0, 'such'], [0.0, 'tail'], [0.02631578947368421, 'taste'], [0.02631578947368421, 'terrific'], [0.0, 'that'], [0.05263157894736842, 'the'], [0.0, 'their'], [0.0, 'there'], [0.0, 'they'], [0.02631578947368421, 'this'], [0.0, 'time'], [0.07894736842105263, 'to'], [0.02631578947368421, 'unremarked'], [0.02631578947368421, 'upon'], [0.0, 'war'], [0.0, 'was'], [0.0, 'we'], [0.0, 'well'], [0.02631578947368421, 'which'], [0.0, 'will'], [0.0, 'with'], [0.0, 'without'], [0.0, 'word'], [0.0, 'world'], [0.0, 'would'], [0.02631578947368421, 'yet']], [[0.018518518518518517, '1990s'], [0.05555555555555555, 'a'], [0.018518518518518517, 'all'], [0.018518518518518517, 'also'], [0.018518518518518517, 'although'], [0.0, 'and'], [0.0, 'animal'], [0.0, 'are'], [0.0, 'as'], [0.018518518518518517, 'be'], [0.0, 'better'], [0.0, 'bore'], [0.0, 'both'], [0.018518518518518517, 'but'], [0.018518518518518517, 'by'], [0.0, 'cancer'], [0.0, 'cannons'], [0.0, 'civil'], [0.0, 'clacking'], [0.0, 'claws'], [0.05555555555555555, 'clouds'], [0.018518518518518517, 'confused'], [0.0, 'crab'], [0.0, 'crabs'], [0.0, 'creature'], [0.018518518518518517, 'describing'], [0.0, 'devourers'], [0.018518518518518517, 'do'], [0.018518518518518517, 'drift'], [0.0, 'eat'], [0.0, 'especially'], [0.0, 'even'], [0.0, 'evil'], [0.0, 'eye'], [0.0, 'filthy'], [0.0, 'for'], [0.0, 'heretofore'], [0.0, 'horses'], [0.018518518518518517, 'in'], [0.05555555555555555, 'is'], [0.0, 'known'], [0.0, 'like'], [0.0, 'little'], [0.0, 'manages'], [0.0, 'meanspirited'], [0.018518518518518517, 'missile'], [0.0, 'most'], [0.0, 'muskets'], [0.018518518518518517, 'named'], [0.0, 'noisome'], [0.0, 'nose'], [0.018518518518518517, 'not'], [0.0, 'of'], [0.0, 'offend'], [0.0, 'offensive'], [0.018518518518518517, 
'only'], [0.0, 'or'], [0.0, 'ordinance'], [0.018518518518518517, 'others'], [0.0, 'outrages'], [0.0, 'particular'], [0.018518518518518517, 'pass'], [0.0, 'perception'], [0.0, 'pieces'], [0.0, 'ravenous'], [0.0, 'repellent'], [0.0, 'reserves'], [0.018518518518518517, 'said'], [0.037037037037037035, 'say'], [0.037037037037037035, 'scud'], [0.018518518518518517, 'scudding'], [0.018518518518518517, 'scut'], [0.0, 'senses'], [0.0, 'sensorium'], [0.0, 'shaped'], [0.018518518518518517, 'short'], [0.0, 'sight'], [0.0, 'smooth'], [0.018518518518518517, 'some'], [0.018518518518518517, 'something'], [0.0, 'sometimes'], [0.0, 'spared'], [0.018518518518518517, 'strangely'], [0.0, 'such'], [0.018518518518518517, 'tail'], [0.0, 'taste'], [0.0, 'terrific'], [0.037037037037037035, 'that'], [0.037037037037037035, 'the'], [0.0, 'their'], [0.018518518518518517, 'there'], [0.018518518518518517, 'they'], [0.018518518518518517, 'this'], [0.018518518518518517, 'time'], [0.037037037037037035, 'to'], [0.0, 'unremarked'], [0.0, 'upon'], [0.0, 'war'], [0.037037037037037035, 'was'], [0.0, 'we'], [0.0, 'well'], [0.018518518518518517, 'which'], [0.0, 'will'], [0.018518518518518517, 'with'], [0.0, 'without'], [0.018518518518518517, 'word'], [0.0, 'world'], [0.0, 'would'], [0.0, 'yet']]]
In [50]:
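# We now compute a similarity score for every pair of documents.
# Because word frequencies are never negative, the maximum similarity is 1
# (identical word distributions) and the minimum similarity is 0 (no words in common).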

import numpy as np



# The words stored next to each frequency are not needed for the similarity
# computation -- they are redundant and only get in the way of the math.
# just_freqs keeps the first element (the frequency) of every pair and drops the word.
# Each document's frequencies become a numpy array to ease the vector math done later.

just_freqs = [
    np.array([entry[0] for entry in doc_freqs])
    for doc_freqs in frequency_counts
]


# This is how the score will be defined.
# It's the cosine of the angle between the two word vectors:
# https://proofwiki.org/wiki/Cosine_Formula_for_Dot_Product

def score(fqA,fqB):
    """Return the cosine similarity of two frequency vectors.

    Parameters
    ----------
    fqA, fqB : array-like of numbers
        Vectors of equal length (numpy arrays, lists or tuples).

    Returns
    -------
    numpy.float64
        dot(fqA, fqB) / (|fqA| * |fqB|), limited to the range [-1, 1].
        For vectors without negative entries the result lies in [0, 1].
        If either vector has zero length (all entries 0), the angle is
        undefined and 0.0 is returned instead of nan.
    """
    fqA = np.asarray(fqA, dtype=float)
    fqB = np.asarray(fqB, dtype=float)
    norm_product = np.sqrt(np.sum(fqA**2))*np.sqrt(np.sum(fqB**2))
    if norm_product == 0:
        # Dividing would give 0/0 = nan plus a RuntimeWarning; treat as "no similarity".
        return np.float64(0.0)
    # Floating-point rounding can push the quotient slightly past 1
    # (e.g. 1.0000000000000002 for identical vectors), so clip it back into range.
    return np.clip(fqA.dot(fqB)/norm_product, -1.0, 1.0)



# Report the similarity of every ordered pair of documents, including each
# document with itself.
for i, doc_a in enumerate(just_freqs):
    for j, doc_b in enumerate(just_freqs):
        similarity = score(doc_a, doc_b)
        print("The score of {} vs {} is {}".format(i, j, similarity))
The score of 0 vs 0 is 1.0
The score of 0 vs 1 is 0.16239958858822998
The score of 0 vs 2 is 0.16733200530681513
The score of 0 vs 3 is 0.16598500055174645
The score of 1 vs 0 is 0.16239958858822998
The score of 1 vs 1 is 0.9999999999999994
The score of 1 vs 2 is 0.06793662204867573
The score of 1 vs 3 is 0.2096569673443836
The score of 2 vs 0 is 0.16733200530681513
The score of 2 vs 1 is 0.06793662204867573
The score of 2 vs 2 is 1.0
The score of 2 vs 3 is 0.3086066999241839
The score of 3 vs 0 is 0.16598500055174645
The score of 3 vs 1 is 0.2096569673443836
The score of 3 vs 2 is 0.3086066999241839
The score of 3 vs 3 is 1.0000000000000002
In [0]: