Path: a3/utils/parser_utils.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2021-2022: Homework 3
parser_utils.py: Utilities for training the dependency parser.
Sahil Chopra <[email protected]>
"""

import time
import os
import logging
from collections import Counter
from .general_utils import get_minibatches
from parser_transitions import minibatch_parse

from tqdm import tqdm
import torch
import numpy as np

P_PREFIX = '<p>:'
L_PREFIX = '<l>:'
UNK = '<UNK>'
NULL = '<NULL>'
ROOT = '<ROOT>'


class Config(object):
    language = 'english'
    with_punct = True
    unlabeled = True
    lowercase = True
    use_pos = True
    use_dep = True
    use_dep = use_dep and (not unlabeled)
    data_path = './data'
    train_file = 'train.conll'
    dev_file = 'dev.conll'
    test_file = 'test.conll'
    embedding_file = './data/en-cw.txt'


class Parser(object):
    """Contains everything needed for transition-based dependency parsing except for the model"""

    def __init__(self, dataset):
        root_labels = list([l for ex in dataset
                            for (h, l) in zip(ex['head'], ex['label']) if h == 0])
        counter = Counter(root_labels)
        if len(counter) > 1:
            logging.info('Warning: more than one root label')
            logging.info(counter)
        self.root_label = counter.most_common()[0][0]
        deprel = [self.root_label] + list(set([w for ex in dataset
                                               for w in ex['label']
                                               if w != self.root_label]))
        tok2id = {L_PREFIX + l: i for (i, l) in enumerate(deprel)}
        tok2id[L_PREFIX + NULL] = self.L_NULL = len(tok2id)

        config = Config()
        self.unlabeled = config.unlabeled
        self.with_punct = config.with_punct
        self.use_pos = config.use_pos
        self.use_dep = config.use_dep
        self.language = config.language

        if self.unlabeled:
            trans = ['L', 'R', 'S']
            self.n_deprel = 1
        else:
            trans = ['L-' + l for l in deprel] + ['R-' + l for l in deprel] + ['S']
            self.n_deprel = len(deprel)

        self.n_trans = len(trans)
        self.tran2id = {t: i for (i, t) in enumerate(trans)}
        self.id2tran = {i: t for (i, t) in enumerate(trans)}

        # logging.info('Build dictionary for part-of-speech tags.')
        tok2id.update(build_dict([P_PREFIX + w for ex in dataset for w in ex['pos']],
                                 offset=len(tok2id)))
        tok2id[P_PREFIX + UNK] = self.P_UNK = len(tok2id)
        tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id)
        tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id)

        # logging.info('Build dictionary for words.')
        tok2id.update(build_dict([w for ex in dataset for w in ex['word']],
                                 offset=len(tok2id)))
        tok2id[UNK] = self.UNK = len(tok2id)
        tok2id[NULL] = self.NULL = len(tok2id)
        tok2id[ROOT] = self.ROOT = len(tok2id)

        self.tok2id = tok2id
        self.id2tok = {v: k for (k, v) in tok2id.items()}

        self.n_features = 18 + (18 if config.use_pos else 0) + (12 if config.use_dep else 0)
        self.n_tokens = len(tok2id)

    def vectorize(self, examples):
        vec_examples = []
        for ex in examples:
            word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id
                                  else self.UNK for w in ex['word']]
            pos = [self.P_ROOT] + [self.tok2id[P_PREFIX + w] if P_PREFIX + w in self.tok2id
                                   else self.P_UNK for w in ex['pos']]
            head = [-1] + ex['head']
            label = [-1] + [self.tok2id[L_PREFIX + w] if L_PREFIX + w in self.tok2id
                            else -1 for w in ex['label']]
            vec_examples.append({'word': word, 'pos': pos,
                                 'head': head, 'label': label})
        return vec_examples

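    # Feature layout (for reference): with the default unlabeled Config,
    # extract_features below produces 36 ids per parser state -- 18 word ids
    # (top 3 of stack and buffer, plus the two leftmost/rightmost children and
    # the leftmost-of-leftmost / rightmost-of-rightmost grandchildren of the
    # top two stack items) and 18 matching POS ids; use_dep would add 12
    # dependency-label ids for the child positions. This matches n_features
    # computed in __init__ and is essentially the template of
    # Chen & Manning (2014).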
    def extract_features(self, stack, buf, arcs, ex):
        if stack[0] == "ROOT":
            stack[0] = 0

        def get_lc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] < k])

        def get_rc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k],
                          reverse=True)

        p_features = []
        l_features = []
        features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]]
        features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf))
        if self.use_pos:
            p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]]
            p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf))

        for i in range(2):
            if i < len(stack):
                k = stack[-i-1]
                lc = get_lc(k)
                rc = get_rc(k)
                llc = get_lc(lc[0]) if len(lc) > 0 else []
                rrc = get_rc(rc[0]) if len(rc) > 0 else []

                features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL)
                features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL)
                features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL)
                features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL)
                features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL)
                features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL)

                if self.use_pos:
                    p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL)

                if self.use_dep:
                    l_features.append(ex['label'][lc[0]] if len(lc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rc[0]] if len(rc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][lc[1]] if len(lc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][rc[1]] if len(rc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][llc[0]] if len(llc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rrc[0]] if len(rrc) > 0 else self.L_NULL)
            else:
                features += [self.NULL] * 6
                if self.use_pos:
                    p_features += [self.P_NULL] * 6
                if self.use_dep:
                    l_features += [self.L_NULL] * 6

        features += p_features + l_features
        assert len(features) == self.n_features
        return features

    def get_oracle(self, stack, buf, ex):
        if len(stack) < 2:
            return self.n_trans - 1

        i0 = stack[-1]
        i1 = stack[-2]
        h0 = ex['head'][i0]
        h1 = ex['head'][i1]
        l0 = ex['label'][i0]
        l1 = ex['label'][i1]

        if self.unlabeled:
            if (i1 > 0) and (h1 == i0):
                return 0
            elif (i1 >= 0) and (h0 == i1) and \
                    (not any([x for x in buf if ex['head'][x] == i0])):
                return 1
            else:
                return None if len(buf) == 0 else 2
        else:
            if (i1 > 0) and (h1 == i0):
                return l1 if (l1 >= 0) and (l1 < self.n_deprel) else None
            elif (i1 >= 0) and (h0 == i1) and \
                    (not any([x for x in buf if ex['head'][x] == i0])):
                return l0 + self.n_deprel if (l0 >= 0) and (l0 < self.n_deprel) else None
            else:
                return None if len(buf) == 0 else self.n_trans - 1

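    # Worked oracle example (illustrative sketch, not executed): for a
    # two-word sentence with ex['head'] = [-1, 2, 0] (w1's head is w2, w2's
    # head is ROOT) and the unlabeled transition set ['L', 'R', 'S'],
    # get_oracle yields the sequence S, S, L, R:
    #   stack=[0],       buf=[1, 2] -> 2 (S)  shift w1
    #   stack=[0, 1],    buf=[2]    -> 2 (S)  shift w2
    #   stack=[0, 1, 2], buf=[]     -> 0 (L)  arc w2 -> w1
    #   stack=[0, 2],    buf=[]     -> 1 (R)  arc ROOT -> w2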
    def create_instances(self, examples):
        all_instances = []
        succ = 0
        for id, ex in enumerate(examples):
            n_words = len(ex['word']) - 1

            # arcs = {(h, t, label)}
            stack = [0]
            buf = [i + 1 for i in range(n_words)]
            arcs = []
            instances = []
            for i in range(n_words * 2):
                gold_t = self.get_oracle(stack, buf, ex)
                if gold_t is None:
                    break
                legal_labels = self.legal_labels(stack, buf)
                assert legal_labels[gold_t] == 1
                instances.append((self.extract_features(stack, buf, arcs, ex),
                                  legal_labels, gold_t))
                if gold_t == self.n_trans - 1:
                    stack.append(buf[0])
                    buf = buf[1:]
                elif gold_t < self.n_deprel:
                    arcs.append((stack[-1], stack[-2], gold_t))
                    stack = stack[:-2] + [stack[-1]]
                else:
                    arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                    stack = stack[:-1]
            else:
                succ += 1
                all_instances += instances

        return all_instances

    def legal_labels(self, stack, buf):
        labels = ([1] if len(stack) > 2 else [0]) * self.n_deprel
        labels += ([1] if len(stack) >= 2 else [0]) * self.n_deprel
        labels += [1] if len(buf) > 0 else [0]
        return labels

    def parse(self, dataset, eval_batch_size=5000):
        sentences = []
        sentence_id_to_idx = {}
        for i, example in enumerate(dataset):
            n_words = len(example['word']) - 1
            sentence = [j + 1 for j in range(n_words)]
            sentences.append(sentence)
            sentence_id_to_idx[id(sentence)] = i

        model = ModelWrapper(self, dataset, sentence_id_to_idx)
        dependencies = minibatch_parse(sentences, model, eval_batch_size)

        UAS = all_tokens = 0.0
        with tqdm(total=len(dataset)) as prog:
            for i, ex in enumerate(dataset):
                head = [-1] * len(ex['word'])
                for h, t in dependencies[i]:
                    head[t] = h
                for pred_h, gold_h, gold_l, pos in \
                        zip(head[1:], ex['head'][1:], ex['label'][1:], ex['pos'][1:]):
                    assert self.id2tok[pos].startswith(P_PREFIX)
                    pos_str = self.id2tok[pos][len(P_PREFIX):]
                    if (self.with_punct) or (not punct(self.language, pos_str)):
                        UAS += 1 if pred_h == gold_h else 0
                        all_tokens += 1
                prog.update(1)  # advance the progress bar by one sentence
        UAS /= all_tokens
        return UAS, dependencies


class ModelWrapper(object):
    def __init__(self, parser, dataset, sentence_id_to_idx):
        self.parser = parser
        self.dataset = dataset
        self.sentence_id_to_idx = sentence_id_to_idx

    def predict(self, partial_parses):
        mb_x = [self.parser.extract_features(p.stack, p.buffer, p.dependencies,
                                             self.dataset[self.sentence_id_to_idx[id(p.sentence)]])
                for p in partial_parses]
        mb_x = np.array(mb_x).astype('int32')
        mb_x = torch.from_numpy(mb_x).long()
        mb_l = [self.parser.legal_labels(p.stack, p.buffer) for p in partial_parses]

        pred = self.parser.model(mb_x)
        pred = pred.detach().numpy()
        pred = np.argmax(pred + 10000 * np.array(mb_l).astype('float32'), 1)
        pred = ["S" if p == 2 else ("LA" if p == 0 else "RA") for p in pred]
        return pred

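# read_conll below expects tab-separated, 10-column CoNLL-X rows, one token
# per line, with blank lines separating sentences; e.g. (hypothetical row):
#   1   The   the   DET   DT   _   2   det   _   _
# Only columns 2 (word form), 5 (POS tag), 7 (head index, 0 = ROOT) and
# 8 (dependency label) are kept; multi-word ranges like "1-2" are skipped.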
def read_conll(in_file, lowercase=False, max_example=None):
    examples = []
    with open(in_file) as f:
        word, pos, head, label = [], [], [], []
        for line in f.readlines():
            sp = line.strip().split('\t')
            if len(sp) == 10:
                if '-' not in sp[0]:
                    word.append(sp[1].lower() if lowercase else sp[1])
                    pos.append(sp[4])
                    head.append(int(sp[6]))
                    label.append(sp[7])
            elif len(word) > 0:
                examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
                word, pos, head, label = [], [], [], []
            if (max_example is not None) and (len(examples) == max_example):
                break
        if len(word) > 0:
            examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
    return examples


def build_dict(keys, n_max=None, offset=0):
    count = Counter()
    for key in keys:
        count[key] += 1
    ls = count.most_common() if n_max is None \
        else count.most_common(n_max)

    return {w[0]: index + offset for (index, w) in enumerate(ls)}


def punct(language, pos):
    if language == 'english':
        return pos in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]
    elif language == 'chinese':
        return pos == 'PU'
    elif language == 'french':
        return pos == 'PUNC'
    elif language == 'german':
        return pos in ["$.", "$,", "$["]
    elif language == 'spanish':
        # http://nlp.stanford.edu/software/spanish-faq.shtml
        return pos in ["f0", "faa", "fat", "fc", "fd", "fe", "fg", "fh",
                       "fia", "fit", "fp", "fpa", "fpt", "fs", "ft",
                       "fx", "fz"]
    elif language == 'universal':
        return pos == 'PUNCT'
    else:
        raise ValueError('language: %s is not supported.' % language)


def minibatches(data, batch_size):
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, 3))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)


def load_and_preprocess_data(reduced=True):
    config = Config()

    print("Loading data...")
    start = time.time()
    train_set = read_conll(os.path.join(config.data_path, config.train_file),
                           lowercase=config.lowercase)
    dev_set = read_conll(os.path.join(config.data_path, config.dev_file),
                         lowercase=config.lowercase)
    test_set = read_conll(os.path.join(config.data_path, config.test_file),
                          lowercase=config.lowercase)
    if reduced:
        train_set = train_set[:1000]
        dev_set = dev_set[:500]
        test_set = test_set[:500]
    print("took {:.2f} seconds".format(time.time() - start))

    print("Building parser...")
    start = time.time()
    parser = Parser(train_set)
    print("took {:.2f} seconds".format(time.time() - start))

    print("Loading pretrained embeddings...")
    start = time.time()
    word_vectors = {}
    for line in open(config.embedding_file).readlines():
        sp = line.strip().split()
        word_vectors[sp[0]] = [float(x) for x in sp[1:]]
    embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

    for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]
    print("took {:.2f} seconds".format(time.time() - start))

    print("Vectorizing data...")
    start = time.time()
    train_set = parser.vectorize(train_set)
    dev_set = parser.vectorize(dev_set)
    test_set = parser.vectorize(test_set)
    print("took {:.2f} seconds".format(time.time() - start))

    print("Preprocessing training data...")
    start = time.time()
    train_examples = parser.create_instances(train_set)
    print("took {:.2f} seconds".format(time.time() - start))

    return parser, embeddings_matrix, train_examples, dev_set, test_set


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


if __name__ == '__main__':
    pass
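# Example usage (a sketch; assumes the CoNLL files and ./data/en-cw.txt
# embeddings named in Config are present, and that the module is imported
# where the general_utils / parser_transitions imports resolve):
#
#     parser, embeddings, train_examples, dev_set, test_set = \
#         load_and_preprocess_data(reduced=True)
#     print(parser.n_features, parser.n_tokens, len(train_examples))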