CoCalc -- utils

GitHub Repository: amanchadha/coursera-natural-language-processing-specialization
Path: blob/master/2 - Natural Language Processing with Probabilistic Models/Week 2/utils_pos.py
⁶⁵ views
1
import string
2

3

4
# Set of ASCII punctuation characters, used for per-character membership tests.
punct = set(string.punctuation)

# Word endings consulted by assign_unk() to pick a pseudo-token for
# out-of-vocabulary words; one list per coarse part-of-speech class.
noun_suffix = [
    "action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion",
    "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship",
    "ty",
]
verb_suffix = [
    "ate", "ify", "ise", "ize",
]
adj_suffix = [
    "able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less",
    "ly", "ous",
]
adv_suffix = [
    "ward", "wards", "wise",
]
12

13

14
def get_word_tag(line, vocab):
    """
    Split one corpus line into a (word, tag) pair.

    Args:
        line: a single line of text, expected to be either blank or to hold
            exactly two whitespace-separated fields (word, then tag).
        vocab: container of known words; only membership (``in``) is used.

    Returns:
        ("--n--", "--s--") for a blank line. Otherwise (word, tag), where the
        word is replaced by the result of assign_unk(word) when it is not in
        vocab.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    # Split once and reuse the result for both the blank test and unpacking.
    fields = line.split()

    # A blank line is reported as the newline/sentence-boundary marker pair.
    if not fields:
        return "--n--", "--s--"

    # Same exception type as the tuple-unpacking failure this replaces, but
    # with a message that identifies the offending line.
    if len(fields) != 2:
        raise ValueError(
            "expected 'word tag' but got %d field(s): %r" % (len(fields), line)
        )

    word, tag = fields
    if word not in vocab:
        # Handle unknown words
        word = assign_unk(word)
    return word, tag
26

27

28
def preprocess(vocab, data_fp):
    """
    Read a one-token-per-line file and map each line to a model token.

    Args:
        vocab: container of known words; only membership (``in``) is used.
        data_fp: path of the text file to read.

    Returns:
        (orig, prep): two lists with one entry per line of the file.
        ``orig`` holds each line with surrounding whitespace stripped.
        ``prep`` holds "--n--" for a blank line, the result of assign_unk()
        for a token that is not in vocab, and the stripped token otherwise.
    """
    orig = []
    prep = []

    # Read data
    with open(data_fp, "r") as data_file:
        for line in data_file:
            token = line.strip()
            orig.append(token)

            # End of sentence
            if not token:
                prep.append("--n--")

            # Handle unknown words. The stripped token is passed on purpose:
            # with the trailing newline still attached, the endswith()-based
            # suffix rules in assign_unk() can never match.
            elif token not in vocab:
                prep.append(assign_unk(token))

            else:
                prep.append(token)

    # Each line appends exactly one entry to both lists, so their lengths
    # equal the line count by construction; the file is not re-read to check.
    return orig, prep
62

63

64
def assign_unk(tok):
    """
    Map an out-of-vocabulary token to an "--unk...--" placeholder.

    Checks run in a fixed order and the first hit wins: any digit, any
    punctuation character, any upper-case character, then the noun, verb,
    adjective and adverb suffix lists. A token matching none of them
    yields the generic "--unk--".
    """
    # Character-level checks come first.
    if any(ch.isdigit() for ch in tok):
        return "--unk_digit--"

    if any(ch in punct for ch in tok):
        return "--unk_punct--"

    if any(ch.isupper() for ch in tok):
        return "--unk_upper--"

    # Suffix-level checks, tried in the listed order.
    suffix_rules = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for endings, placeholder in suffix_rules:
        # endswith() accepts a tuple and is true if any ending matches.
        if tok.endswith(tuple(endings)):
            return placeholder

    return "--unk--"
97

98
Product

Resources

Company