Path: blob/master/2 - Natural Language Processing with Probabilistic Models/Week 2/utils_pos.py
65 views
import string

# Punctuation characters
punct = set(string.punctuation)

# Morphology rules used to assign unknown word tokens
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]


def get_word_tag(line, vocab):
    """Split one "word tag" line into a (word, tag) pair.

    Args:
        line: A line of text holding a word and its tag separated by
            whitespace, or a blank line.
        vocab: Container of known words (anything supporting ``in``).

    Returns:
        ``("--n--", "--s--")`` for a blank line; otherwise ``(word, tag)``
        where ``word`` is replaced by the result of ``assign_unk`` when it
        is not in ``vocab``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    fields = line.split()
    if not fields:
        return "--n--", "--s--"

    # Unpacking raises ValueError unless there are exactly two fields.
    word, tag = fields
    if word not in vocab:
        # Handle unknown words
        word = assign_unk(word)
    return word, tag


def preprocess(vocab, data_fp):
    """Read a one-token-per-line file and map its tokens onto the vocabulary.

    Args:
        vocab: Container of known words (anything supporting ``in``).
        data_fp: Path of the text file to read.

    Returns:
        A pair ``(orig, prep)`` of equal-length lists with one entry per
        line of the file. ``orig`` holds each line stripped of surrounding
        whitespace. ``prep`` holds ``"--n--"`` for blank lines, the
        ``assign_unk`` token for words not in ``vocab``, and the stripped
        word otherwise.
    """
    orig = []
    prep = []

    with open(data_fp, "r") as data_file:
        for line in data_file:
            token = line.strip()
            orig.append(token)

            if not token:
                # End of sentence
                prep.append("--n--")
            elif token not in vocab:
                # Classify the stripped token: a trailing newline would
                # defeat every suffix check in assign_unk.
                prep.append(assign_unk(token))
            else:
                prep.append(token)

    # Exactly one entry is appended to each list per line read, so the
    # lists always match the file's line count; no re-read is needed.
    return orig, prep


def assign_unk(tok):
    """Return the unknown-word placeholder token for ``tok``.

    Checks are applied in order and the first match wins: any digit, any
    punctuation character, any upper-case character, then noun, verb,
    adjective and adverb suffixes. ``"--unk--"`` is returned when nothing
    matches.

    Args:
        tok: The word to classify.

    Returns:
        One of ``"--unk_digit--"``, ``"--unk_punct--"``, ``"--unk_upper--"``,
        ``"--unk_noun--"``, ``"--unk_verb--"``, ``"--unk_adj--"``,
        ``"--unk_adv--"`` or ``"--unk--"``.
    """
    # Digits
    if any(char.isdigit() for char in tok):
        return "--unk_digit--"

    # Punctuation
    if any(char in punct for char in tok):
        return "--unk_punct--"

    # Upper-case
    if any(char.isupper() for char in tok):
        return "--unk_upper--"

    # Nouns (str.endswith accepts a tuple of candidate suffixes)
    if tok.endswith(tuple(noun_suffix)):
        return "--unk_noun--"

    # Verbs
    if tok.endswith(tuple(verb_suffix)):
        return "--unk_verb--"

    # Adjectives
    if tok.endswith(tuple(adj_suffix)):
        return "--unk_adj--"

    # Adverbs
    if tok.endswith(tuple(adv_suffix)):
        return "--unk_adv--"

    return "--unk--"