Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
18892 views
ubuntu2004
Kernel: Python 3 (Anaconda 2020)
from bs4 import BeautifulSoup import glob import re import json import csv import os
# One source document was removed from both the Metadata and word-count
# folders because its ngram1 file contained no information:
# journal-article-10.2307_1196540-ngram1.txt

# Aggregated word counts. Schema: {pub_year: {word: count}}
total_freq = {}

for xml in glob.iglob('Metadata/*.xml'):
    with open(xml) as f:
        bs = BeautifulSoup(f, "lxml-xml")

    # Extract the publication year from the text of the first <year> element.
    # Reading the element text (rather than slicing the serialized tag) keeps
    # working if the tag carries attributes or surrounding whitespace; the
    # first four characters are taken, as before.
    year = int(bs.year.get_text(strip=True)[:4])
    year_freq = total_freq.setdefault(year, {})

    # Each metadata file has a matching unigram file in the word-count folder.
    txt = xml.replace("Metadata", "word-count").replace(".xml", "-ngram1.txt")
    with open(txt) as t:
        for line in t:
            fields = line.split()
            if len(fields) < 2:
                # Blank or incomplete line: nothing to count.
                continue
            word = fields[0]
            count = int(fields[1])
            year_freq[word] = year_freq.get(word, 0) + count

# json.dump writes the integer year keys as strings.
with open("count.json", "w") as file:
    json.dump(total_freq, file)
import pandas as pd

# Wide table built from total_freq: one row per outer key (publication year),
# one column per word, NaN where a word does not occur in that year.
df = pd.DataFrame.from_dict(total_freq, orient='index')

# Reshape to long form: one row per (year, word) pair, keeping the year index.
# The corpus contains the word "value", so a column of that name exists and
# clashes with melt's default value_name (FutureWarning in older pandas).
# Melt under a placeholder name and rename afterwards so the resulting frame
# still exposes the 'variable' and 'value' columns.
long_df = pd.melt(df, ignore_index=False, value_name='__count__')
long_df = long_df.rename(columns={'__count__': 'value'})
/ext/anaconda2020.02/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3343: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resultiing Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name. exec(code_obj, self.user_global_ns, self.user_ns)
# Turn the index into an ordinary column. The index is unnamed, so pandas
# names the new column 'index'; rows get a fresh 0..n-1 integer index.
long_df = long_df.reset_index()
long_df
# Stop words and other noise tokens (single letters, bare numbers, publisher
# boilerplate) to exclude from the frequency table. Keeping them in one set
# lets a single membership test replace one drop() call per word.
STOP_WORDS = {
    # digits and numeric tokens
    '0', '00', '1', '2', '3', '4', '5', '6', '7',
    '25', '35', '50', '75', '95', '800', '86554',
    # single letters and short abbreviations
    'b', 'c', 'd', 'e', 'h', 'i', 'j', 'l', 'm', 'n', 'p', 'r', 's', 't',
    'w', 'x', 'ii', 'pp', 'co', 'st', 'mr', 'dr', 'ibid',
    # bibliographic / publisher boilerplate
    'new', 'york', 'press', 'isbn', 'pages', 'cloth', 'hardback',
    # pronouns and determiners
    'he', 'his', 'him', 'himself', 'she', 'her', 'its', 'itself', 'we',
    'our', 'us', 'you', 'your', 'my', 'me', 'them', 'those', 'who', 'what',
    'which', 'all', 'any', 'both', 'each', 'many', 'more', 'most', 'much',
    'one', 'two', 'other', 'own', 'same', 'some',
    # auxiliaries and common verbs
    'has', 'have', 'had', 'been', 'were', 'would', 'could', 'should', 'may',
    'must', 'can', 'cannot', 'do', 'does', 'made', 'make', 'see',
    # prepositions, conjunctions, adverbs
    'from', 'between', 'through', 'upon', 'within', 'about', 'out', 'than',
    'so', 'when', 'while', 'whether', 'how', 'also', 'only', 'even', 'well',
    'very', 'far', 'now', 'here', 'like', 'yet', 'thus', 'still', 'never',
    # corpus-specific words excluded by hand
    'car', 'work', 'factor', 'people', 'person', 'type', 'action', 'sense',
    'focus',
}

# Keep only the rows whose word is not in STOP_WORDS. Surviving rows keep
# their existing index labels.
long_df = long_df[~long_df['variable'].isin(STOP_WORDS)]
long_df
def _top_words_per_year(frame, n):
    """Return, for each distinct 'index' value, the n rows with the largest 'value'.

    Rows are grouped on the 'index' column, each group is sorted by 'value'
    in descending order, truncated to its first n rows and renumbered from 0.
    """
    def _head(group):
        ranked = group.sort_values(by='value', ascending=False)
        return ranked.head(n).reset_index(drop=True)

    return frame.groupby('index').apply(_head)


top_fifteen_words = _top_words_per_year(long_df, 15)

# Raise the display row limit so the per-group tables print in full.
# NOTE(review): 'display.max_row' presumably resolves to pandas'
# 'display.max_rows' option via option-name pattern matching - confirm.
pd.set_option('display.max_row', 2000)

top_ten_words = _top_words_per_year(long_df, 10)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-97077e0e5b89> in <module> ----> 1 top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True)) NameError: name 'long_df' is not defined
# You can now see the top ten or the top fifteen words used over time.
# To display either table, type top_ten_words or top_fifteen_words in a cell
# and then press Command+Return.
#top_ten_words
#top_fifteen_words