# Path: blob/master/deep_learning/word2vec/word2vec_workflow.py
# 2585 views
"""
1) Basic preprocessing of the raw text.
2) Trains a Phrase model to glue words that commonly appear next to
   each other into bigrams.
3) Trains the Word2vec model (skipgram + negative sampling); currently,
   there is zero hyperparameter tuning.
"""
import os
import re
import logging
from joblib import cpu_count
from string import punctuation
from logzero import setup_logger
from nltk.corpus import stopwords
from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser
from gensim.models.word2vec import LineSentence
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

logger = setup_logger(name=__name__, logfile='word2vec.log', level=logging.INFO)

# Matches every character that is neither an ASCII letter nor ASCII whitespace.
# Compiled once at import time; the flags must live here (or be passed through
# the ``flags=`` keyword) because the 4th positional argument of ``re.sub`` is
# ``count``, not ``flags``.
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)


def main():
    """
    Run the whole workflow: preprocess text -> phrase model -> word2vec.

    Every stage writes its output under the ``model`` directory and is
    skipped on later runs when that output already exists, so the script
    can be re-run to resume after a stage has completed.
    """
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # a set of stopwords built-in to various packages
    # we can always expand this set for the
    # problem that we are working on, here we also included
    # python built-in string punctuation mark
    STOPWORDS = set(stopwords.words('english')) | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs in later section
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(MODEL_DIR, exist_ok=True)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts=TEXTS, stop_words=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        # sg=1 selects skip-gram; gensim's default (sg=0) is CBOW, which
        # contradicts the skipgram + negative sampling setup described in
        # the module docstring. Negative sampling is already the default.
        word2vec = Word2Vec(corpus_file=BIGRAM_PATH, sg=1, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')


def export_unigrams(unigram_path, texts, stop_words):
    """
    Preprocess the raw text and export it to a .txt file,
    where each line is one document. For what sort of preprocessing
    is done, please refer to the `normalize_text` function.

    Parameters
    ----------
    unigram_path : str
        output file path of the preprocessed unigram text.

    texts : iterable
        iterable can be simply a list, but for larger corpora,
        consider an iterable that streams the sentences directly from
        disk/network using Gensim's LineSentence or something along
        those lines.

    stop_words : set
        stopword set that will be excluded from the corpus.
    """
    with open(unigram_path, 'w', encoding='utf_8') as f:
        for text in texts:
            cleaned_text = normalize_text(text, stop_words)
            f.write(cleaned_text + '\n')


def normalize_text(text, stop_words):
    """
    Strip non-letter characters, lower case the text and drop stopwords.

    Parameters
    ----------
    text : str
        raw text of a single document.

    stop_words : set
        lower cased tokens that will be removed from the text.

    Returns
    -------
    cleaned_text : str
        remaining tokens joined by a single space; all whitespace,
        including newlines, is collapsed so the result fits on one line.
    """
    # remove every character that is not an ASCII letter or whitespace
    text = _NON_ALPHA_PATTERN.sub('', text)

    # lower case & tokenize text; str.split() with no argument splits on
    # runs of whitespace and never yields empty tokens
    tokens = text.lower().split()

    # filter stopwords out of text &
    # re-create text from filtered tokens
    cleaned_text = ' '.join(token for token in tokens if token not in stop_words)
    return cleaned_text


def export_bigrams(unigram_path, bigram_path, phrase_model):
    """
    Use the learned phrase model to create (potential) bigrams,
    and output the text that contains bigrams to disk

    Parameters
    ----------
    unigram_path : str
        input file path of the preprocessed unigram text

    bigram_path : str
        output file path of the transformed bigram text

    phrase_model : gensim's Phrase model object

    References
    ----------
    Gensim Phrase Detection
    - https://radimrehurek.com/gensim/models/phrases.html
    """

    # after training the Phrase model, create a performant
    # Phraser object to transform any sentence (list of
    # token strings) and glue unigrams together into bigrams
    phraser = Phraser(phrase_model)

    # read and write with the same explicit encoding that export_unigrams
    # uses, instead of relying on the platform default
    with open(bigram_path, 'w', encoding='utf_8') as fout, \
            open(unigram_path, encoding='utf_8') as fin:
        for text in fin:
            unigram = text.split()
            bigram = phraser[unigram]
            bigram_sentence = ' '.join(bigram)
            fout.write(bigram_sentence + '\n')


if __name__ == '__main__':
    main()