In [3]:
class data:
    # minimal attribute container used in place of command-line argparse arguments
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
args = data()
args.filename = "soya_dataset_year.csv"  # input CSV with "title" and "contents" columns
args.stemmer = "lemma"                    # one of: porter, porter2, lemma
args.dictionary = "technology"            # prefix of the "<name>-english" word-list file opened below
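For reference, the same settings could be held in an argparse.Namespace from the standard library; a minimal sketch, equivalent in behaviour since the notebook only reads attributes from args:

from argparse import Namespace
args = Namespace(filename="soya_dataset_year.csv",
                 stemmer="lemma",
                 dictionary="technology")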
In [6]:
# -*- coding: utf-8 -*-
import numpy as np
import csv
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora, models

from stemming.porter2 import stem
from nltk.stem import *
import unicodecsv
import re
import pyLDAvis.gensim
import gensim
import lda  # provides the bundled Reuters sample data used when args.filename == "sample"

_digits = re.compile(r'\d')
def contains_digits(d):
    return bool(_digits.search(d))

import enchant
d = enchant.Dict("en_US")  # pyenchant spell checker (defined but not used later in this cell)
# Or using the /usr/share/dict/british-english word list
with open(args.dictionary + "-english") as word_file:
  english_words = set(word.strip().lower() for word in word_file)
  # print(english_words)  # debug: dumps the entire word list

def is_english_word(word):
  return word.lower() in english_words

def process_tokens(tokens, stemmer):
  # drop stop words, tokens containing digits, and words not in the English word list
  # (en_stop is defined further down in this cell, before process_tokens is first called)
  tokens = [i for i in tokens if i not in en_stop and not contains_digits(i) and is_english_word(i)]
  if stemmer == 'porter':
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(i) for i in tokens]
  elif stemmer == 'porter2':
    tokens = [stem(i) for i in tokens]
  elif stemmer == 'lemma':
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(i) for i in tokens]
  return tokens


if args.filename == "sample":
  X = lda.datasets.load_reuters()
  dictionary = lda.datasets.load_reuters_vocab()
  titles = lda.datasets.load_reuters_titles()
else:
  f = open(args.filename)
  reader = unicodecsv.reader(f, encoding='utf-8')
  # csv_length = sum(1 for row in reader)
  # f.seek(0) #reset reader position
  identifiers = reader.next()  # header row
  contents_idx = identifiers.index("contents")
  title_idx = identifiers.index("title")

  # keep only rows with a non-empty "contents" field
  contents = [ row[contents_idx] for row in reader if row[contents_idx] ]

  # rewind and re-read so the titles line up with the filtered contents
  f.seek(0)
  reader.next()  # skip the header row again
  titles = [ row[title_idx] for row in reader if row[contents_idx] ]



  texts = list()
  tokenizer = RegexpTokenizer(r'\w+')  # split on runs of word characters
  en_stop = get_stop_words('en')       # English stop-word list
  for idx,i in enumerate(contents):
#     if not idx % 10:
#       print "INFO: Tokenizing articles <{}> ".format(idx)
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(process_tokens(tokens, args.stemmer))
    # print idx
    # add tokens to list

  print "[DEBUG] Length of Texts : {}".format(len(texts))
  dictionary = corpora.Dictionary(texts)
  corpus = [dictionary.doc2bow(text) for text in texts]
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-6-2ccd699120fa> in <module>()
      3 import csv
      4 from nltk.tokenize import RegexpTokenizer
----> 5 from stop_words import get_stop_words
      6 from gensim import corpora, models
      7 

ImportError: No module named stop_words
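The cell above aborts at from stop_words import get_stop_words because the stop-words package is not installed (pip install stop-words). An alternative sketch, assuming NLTK is acceptable as a stop-word source, which defines the same en_stop name that process_tokens expects:

import nltk
nltk.download('stopwords')                 # one-time download of the stop-word corpus
from nltk.corpus import stopwords
en_stop = set(stopwords.words('english'))  # drop-in replacement for get_stop_words('en')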
In [5]:
pyLDAvis.enable_notebook()
model_filename = args.filename.split('.')[0] + '.model'
try:
  ldamodel = models.LdaModel.load(model_filename)
except IOError:
  # no saved model on disk: train a fresh 10-topic LDA model and cache it for later runs
  ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20)
  ldamodel.save(model_filename)

pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-ca3d9701ffb8> in <module>()
----> 1 pyLDAvis.enable_notebook()
      2 model_filename = args.filename.split('.')[0] + '.model'
      3 try:
      4   ldamodel = models.LdaModel.load(model_filename)
      5 except IOError:

NameError: name 'pyLDAvis' is not defined
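The NameError is a knock-on effect of the failed import cell: pyLDAvis was never bound because In [6] stopped at the stop_words import. After fixing that import and re-running the cells in order, this cell should run as written; a minimal standalone sketch of the visualisation step, assuming ldamodel, corpus, and dictionary already exist in the session:

import pyLDAvis
import pyLDAvis.gensim   # gensim adapter in older pyLDAvis releases (pyLDAvis.gensim_models in newer ones)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
vis  # the last expression in the cell renders the interactive topic map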
In [ ]: