# Source: GitHub repository amanchadha/coursera-natural-language-processing-specialization
# Path: blob/master/2 - Natural Language Processing with Probabilistic Models/Week 2/utils_pos.py
1
import string
2
3
4
# Set of the characters in string.punctuation, used for per-character
# membership tests when classifying unknown tokens.
punct = set(string.punctuation)

# Word-ending tables used to guess a category for out-of-vocabulary tokens.
noun_suffix = [
    "action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood",
    "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry",
    "scape", "ship", "ty",
]
verb_suffix = [
    "ate", "ify", "ise", "ize",
]
adj_suffix = [
    "able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive",
    "less", "ly", "ous",
]
adv_suffix = [
    "ward", "wards", "wise",
]
12
13
14
def get_word_tag(line, vocab):
    """Split one corpus line into a ``(word, tag)`` pair.

    Args:
        line: A line of text holding a word and its tag separated by
            whitespace, or a blank / whitespace-only line.
        vocab: Container of known words; only ``in`` membership is used.

    Returns:
        ``("--n--", "--s--")`` for a blank line. Otherwise ``(word, tag)``,
        where ``word`` is replaced by the placeholder returned from
        ``assign_unk`` when it is not found in ``vocab``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            whitespace-separated fields.
    """
    fields = line.split()

    # Blank line: emit the fixed word/tag pair used for this case.
    if not fields:
        return "--n--", "--s--"

    # Unpacking enforces the "exactly two fields" shape of a corpus line.
    word, tag = fields
    if word not in vocab:
        # Handle unknown words
        word = assign_unk(word)
    return word, tag
26
27
28
def preprocess(vocab, data_fp):
    """Read a one-token-per-line file and replace unknown tokens.

    Args:
        vocab: Container of known words; only ``in`` membership is used.
        data_fp: Path of the text file to read.

    Returns:
        A pair ``(orig, prep)`` of lists with exactly one entry per line of
        the file:

        * ``orig`` holds each line with surrounding whitespace stripped
          (the empty string for a blank line).
        * ``prep`` holds ``"--n--"`` for a blank line, the stripped token
          when it is in ``vocab``, and otherwise the placeholder returned
          by ``assign_unk``.
    """
    orig = []
    prep = []

    # Read data
    with open(data_fp, "r") as data_file:
        for line in data_file:
            token = line.strip()
            orig.append(token)

            if not token:
                # End of sentence
                prep.append("--n--")
            elif token not in vocab:
                # Handle unknown words. The stripped token is passed on
                # purpose: a trailing newline would stop every suffix rule
                # in assign_unk from matching.
                prep.append(assign_unk(token))
            else:
                prep.append(token)

    # Each iteration appends exactly once to both lists, so their lengths
    # always equal the number of lines read; no need to re-read the file.
    return orig, prep
62
63
64
def assign_unk(tok):
    """Map an unknown token to a coarse placeholder label.

    The checks run in a fixed order and the first match wins: any digit,
    any punctuation character, any upper-case character, then noun, verb,
    adjective and adverb suffixes. A token matching none of them is
    labelled ``"--unk--"``.
    """
    # Character-level checks take precedence over the suffix tables.
    if any(ch.isdigit() for ch in tok):
        return "--unk_digit--"
    if any(ch in punct for ch in tok):
        return "--unk_punct--"
    if any(ch.isupper() for ch in tok):
        return "--unk_upper--"

    # Suffix tables, in priority order: nouns, verbs, adjectives, adverbs.
    suffix_rules = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for endings, label in suffix_rules:
        # str.endswith accepts a tuple and is true if any ending matches.
        if tok.endswith(tuple(endings)):
            return label

    return "--unk--"
97
98