CoCalc -- 2021-04-30-word-frequency-over-time.ipynb

¹⁹⁰¹⁸ views
ubuntu2004

Kernel: Python 3 (Anaconda 2020)

In [1]:

from bs4 import BeautifulSoup
import glob
import re
import json
import csv
import os

In [2]:

# had to delete one file from both Metadata and word-count file due to the ngram1 file not containing any information. journal-article-10.2307_1196540-ngram1.txt
total_freq = {}

# The json schema here is {pub-year: {word: count}}

for xml in glob.iglob('Metadata/*.xml'):
    with open(xml) as f:
        bs = BeautifulSoup(f, "lxml-xml")
    
    # Extract pub lish year
    pub_year = bs.year
    year = int(str(pub_year)[6:10])
    
    total_freq[year] = total_freq.get(year, {})

    
    txt = xml.replace("Metadata", "word-count").replace(".xml", "-ngram1.txt")
    
    
    
    with open(txt) as t:
        for line in t:
            sub = re.split("\s+", line)
            word = sub[0]
            count = int(sub[1])
            
            if total_freq[year].get(word, 0) == 0:
                total_freq[year][word] = 0

            total_freq[year][word] += count

    
file = open("count.json", "w")
    
with file:
    json.dump(total_freq, file)

In [0]:

In [3]:

import pandas as pd

In [4]:

df = pd.DataFrame.from_dict(total_freq, orient = 'index')

In [5]:

long_df = pd.melt(df, ignore_index = False)

Out[5]:

/ext/anaconda2020.02/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3343: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resultiing Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name.
  exec(code_obj, self.user_global_ns, self.user_ns)

In [6]:

long_df.reset_index(inplace=True)

In [7]:

long_df

Out[7]:

In [0]:

In [8]:

#I tried creatinga loop for removing stop words from the data frame but it pulled out the variable column, so I guess I'm doing this probably the worst way

long_df.drop(long_df.index[long_df['variable'] == 's'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '00'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '95'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '0'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '1'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'i'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'pp'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'has'], inplace = True)

In [9]:

long_df

Out[9]:

In [0]:

In [10]:

long_df.drop(long_df.index[long_df['variable'] == 'new'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'which'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'from'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'his'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'its'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'have'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'all'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'he'], inplace = True)

In [11]:

long_df.drop(long_df.index[long_df['variable'] == 'one'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'been'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'pages'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'we'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'would'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'had'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'j'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'our'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'cloth'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'hardback'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'you'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'p'], inplace = True)

In [12]:

long_df.drop(long_df.index[long_df['variable'] == 'new'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'york'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'press'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'were'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'any'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'isbn'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '35'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '800'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '86554'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'ii'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'e'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'd'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'her'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'b'], inplace = True)

In [13]:

long_df.drop(long_df.index[long_df['variable'] == 'who'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'also'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'what'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'more'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'may'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'should'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'those'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'ibid'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '86554'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'h'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '6'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '50'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '3'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'car'], inplace = True)

In [14]:

long_df.drop(long_df.index[long_df['variable'] == 'same'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'l'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'see'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'my'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'so'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'between'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'other'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'us'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'she'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'many'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'work'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'can'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'your'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '2'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '4'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '75'], inplace = True)

In [15]:

long_df.drop(long_df.index[long_df['variable'] == 'only'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'them'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'than'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'factor'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '5'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'co'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'two'], inplace = True)

In [16]:

long_df.drop(long_df.index[long_df['variable'] == 'most'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'must'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'upon'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'me'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'do'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'cannot'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'when'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'some'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'through'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'people'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'much'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'w'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'm'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'person'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'does'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'him'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'st'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'm'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'yet'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'c'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'x'], inplace = True)

In [17]:

long_df.drop(long_df.index[long_df['variable'] == 'could'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'd'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'when'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'how'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'whether'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'yet'], inplace = True)

In [18]:

long_df.drop(long_df.index[long_df['variable'] == 'out'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'even'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'type'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'action'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'well'], inplace = True)

In [19]:

long_df.drop(long_df.index[long_df['variable'] == 'very'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'mr'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'dr'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'made'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'own'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'far'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'himself'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '25'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '7'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'now'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'each'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'like'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'here'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'about'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'within'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'n'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 't'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'both'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'r'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'n'], inplace = True)

In [20]:

long_df.drop(long_df.index[long_df['variable'] == 'thus'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'itself'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'still'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'make'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'sense'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'focus'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'never'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'while'], inplace = True)

In [21]:

top_fifteen_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(15).reset_index(drop = True))

In [23]:

pd.set_option('display.max_row', 2000)

In [0]:

In [1]:

top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True))

Out[1]:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-97077e0e5b89> in <module>
----> 1 top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True))

NameError: name 'long_df' is not defined

In [0]:

In [0]:

#You can now see the top_ten_words or the top_fifteen_words used over time. All you need to do is type top_ten_words or top_fifteen_words and then hit command return. 

#top_ten_words
#top_fifteen_words

In [0]:

In [0]:

In [0]:

Product

Resources

Company