class data:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = data()
args.filename = "soya_dataset_year.csv"
args.stemmer = "lemma"
args.dictionary = "technology"
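# The fields above stand in for command-line arguments: the input CSV, the
# token normaliser ('porter', 'porter2', or 'lemma'), and the prefix of the
# word-list file used to keep only dictionary words.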
# -*- coding: utf-8 -*-
import re
import csv
import numpy as np
import unicodecsv
import gensim
from gensim import corpora, models
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from stemming.porter2 import stem
from stop_words import get_stop_words
import pyLDAvis.gensim
import lda  # bundled Reuters sample data, used when args.filename == "sample"
_digits = re.compile(r'\d')

def contains_digits(d):
    return bool(_digits.search(d))
import enchant
d = enchant.Dict("en_US")

# Or using the /usr/share/dict/british-english word list
with open(args.dictionary + "-english") as word_file:
    english_words = set(word.strip().lower() for word in word_file)
print(english_words)

def is_english_word(word):
    return word.lower() in english_words
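# Note: this check is plain membership in the word list loaded above;
# e.g. is_english_word("Soybean") is True only if "soybean" appears in that file.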
def process_tokens(tokens, stemmer):
    # Drop stop words (en_stop, defined below), digit-bearing tokens, and words
    # missing from the loaded word list, then apply the selected normaliser.
    tokens = [i for i in tokens
              if i not in en_stop and not contains_digits(i) and is_english_word(i)]
    if stemmer == 'porter':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(i) for i in tokens]
    elif stemmer == 'porter2':
        tokens = [stem(i) for i in tokens]
    elif stemmer == 'lemma':
        lemmatiser = WordNetLemmatizer()
        tokens = [lemmatiser.lemmatize(i) for i in tokens]
    return tokens
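# Rough illustration (assuming "markets" and "rallied" pass the word-list check):
#   process_tokens(["the", "markets", "rallied", "42"], "lemma")
#   -> ["market", "rallied"]   # stop word and digit token dropped, nouns lemmatised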
if args.filename == "sample":
    # Bundled Reuters sample shipped with the 'lda' package
    X = lda.datasets.load_reuters()
    dictionary = lda.datasets.load_reuters_vocab()
    titles = lda.datasets.load_reuters_titles()
else:
    f = open(args.filename)
    reader = unicodecsv.reader(f, encoding='utf-8')
    # csv_length = sum(1 for row in reader)
    # f.seek(0)  # reset reader position
    identifiers = next(reader)  # header row
    contents_idx = identifiers.index("contents")
    title_idx = identifiers.index("title")
    contents = [row[contents_idx] for row in reader if row[contents_idx]]
    f.seek(0)
    next(reader)  # skip the header again before re-reading for titles
    titles = [row[title_idx] for row in reader if row[contents_idx]]
texts = list()
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')

for idx, i in enumerate(contents):
    # if not idx % 10:
    #     print("INFO: Tokenizing articles <{}>".format(idx))
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(process_tokens(tokens, args.stemmer))
    # print(idx)
    # add tokens to list
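# 'texts' now holds one token list per article, the shape gensim's Dictionary expects.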
print "[DEBUG] Length of Texts : {}".format(len(texts))
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
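# The Dictionary assigns an integer id to every token; doc2bow converts each
# document into a sparse bag-of-words list of (token_id, count) pairs.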
pyLDAvis.enable_notebook()
model_filename = args.filename.split('.')[0] + '.model'
try:
    # Reuse a previously trained model for this dataset if one exists
    ldamodel = models.LdaModel.load(model_filename)
except IOError:
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20)
    ldamodel.save(model_filename)
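# Optional sketch (not part of the original flow): dump the top words per topic
# as plain text alongside the interactive view prepared below.
for topic in ldamodel.print_topics():
    print(topic)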
# With enable_notebook() active, the prepared visualisation renders inline when
# this is the last expression in the cell.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)