GitHub Repository: aimacode/aima-python
Path: blob/master/nlp4e.py
"""Natural Language Processing (Chapter 22)"""

from collections import defaultdict
from utils4e import weighted_choice
import copy
import operator
import heapq
from search import Problem


# ______________________________________________________________________________
# 22.2 Grammars

def Rules(**rules):
    """Create a dictionary mapping symbols to alternative sequences.
    >>> Rules(A = "B C | D E")
    {'A': [['B', 'C'], ['D', 'E']]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [alt.strip().split() for alt in rhs.split('|')]
    return rules


def Lexicon(**rules):
    """Create a dictionary mapping symbols to alternative words.
    >>> Lexicon(Article = "the | a | an")
    {'Article': ['the', 'a', 'an']}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [word.strip() for word in rhs.split('|')]
    return rules

class Grammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word in lexicon[lhs]:
                self.categories[word].append(lhs)

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in self.categories[word]

    def cnf_rules(self):
        """Return the tuples (X, Y, Z) for rules of the form X -> Y Z.
        (Assumes every rule has exactly two symbols on its right-hand side.)"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z) in rules:
                cnf.append((X, Y, Z))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively)."""
        import random

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    rewrite(random.choice(self.rules[token]), into)
                elif token in self.lexicon:
                    into.append(random.choice(self.lexicon[token]))
                else:
                    into.append(token)
            return into

        return ' '.join(rewrite(S.split(), []))

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    with probabilities.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [alt.strip().split() for alt in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            rhs_rule = (r[:-1], prob)
            rules[lhs].append(rhs_rule)

    return rules


def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    with probabilities.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [word.strip().split() for word in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            word = r[:-1][0]
            rhs_rule = (word, prob)
            rules[lhs].append(rhs_rule)

    return rules

class ProbGrammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)

        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in [c for c, _ in self.categories[word]]

    def cnf_rules(self):
        """Return the tuples (X, Y, Z, p) for rules of the form X -> Y Z [p]."""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z), p in rules:
                cnf.append((X, Y, Z, p))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

E0 = Grammar('E0',
             Rules(  # Grammar for E_0 [Figure 22.2]
                 S='NP VP | S Conjunction S',
                 NP='Pronoun | Name | Noun | Article Noun | Digit Digit | NP PP | NP RelClause',
                 VP='Verb | VP NP | VP Adjective | VP PP | VP Adverb',
                 PP='Preposition NP',
                 RelClause='That VP'),

             Lexicon(  # Lexicon for E_0 [Figure 22.3]
                 Noun="stench | breeze | glitter | nothing | wumpus | pit | pits | gold | east",
                 Verb="is | see | smell | shoot | fell | stinks | go | grab | carry | kill | turn | feel",  # noqa
                 Adjective="right | left | east | south | back | smelly | dead",
                 Adverb="here | there | nearby | ahead | right | left | east | south | back",
                 Pronoun="me | you | I | it",
                 Name="John | Mary | Boston | Aristotle",
                 Article="the | a | an",
                 Preposition="to | in | on | near",
                 Conjunction="and | or | but",
                 Digit="0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9",
                 That="that"
             ))

E_ = Grammar('E_',  # Trivial Grammar and lexicon for testing
             Rules(
                 S='NP VP',
                 NP='Art N | Pronoun',
                 VP='V NP'),

             Lexicon(
                 Art='the | a',
                 N='man | woman | table | shoelace | saw',
                 Pronoun='I | you | it',
                 V='saw | liked | feel'
             ))

E_NP_ = Grammar('E_NP_',  # Another Trivial Grammar for testing
                Rules(NP='Adj NP | N'),
                Lexicon(Adj='happy | handsome | hairy',
                        N='man'))
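
# Illustrative usage (not part of the original file): Grammar.generate_random expands
# nonterminals with uniformly random alternatives, so its output differs on every call.
#
#     E_.generate_random('S')       # e.g. 'the man saw it'
#     E_NP_.generate_random('NP')   # e.g. 'happy hairy man'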

E_Prob = ProbGrammar('E_Prob',  # The Probabilistic Grammar from the notebook
                     ProbRules(
                         S="NP VP [0.6] | S Conjunction S [0.4]",
                         NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
                         | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
                         VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
                         Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
                         PP="Preposition NP [1]",
                         RelClause="RelPro VP [1]"
                     ),
                     ProbLexicon(
                         Verb="is [0.5] | say [0.3] | are [0.2]",
                         Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
                         Adjective="good [0.5] | new [0.2] | sad [0.3]",
                         Adverb="here [0.6] | lightly [0.1] | now [0.3]",
                         Pronoun="me [0.3] | you [0.4] | he [0.3]",
                         RelPro="that [0.5] | who [0.3] | which [0.2]",
                         Name="john [0.4] | mary [0.4] | peter [0.2]",
                         Article="the [0.5] | a [0.25] | an [0.25]",
                         Preposition="to [0.4] | in [0.3] | at [0.3]",
                         Conjunction="and [0.5] | or [0.2] | but [0.3]",
                         Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
                     ))
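
# Illustrative usage (not part of the original file): ProbGrammar.generate_random
# returns a (sentence, probability) pair, where the probability is the product of the
# probabilities of every rule and lexical choice made during generation.
#
#     sentence, probability = E_Prob.generate_random('S')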

E_Chomsky = Grammar('E_Chomsky',  # A Grammar in Chomsky Normal Form
                    Rules(
                        S='NP VP',
                        NP='Article Noun | Adjective Noun',
                        VP='Verb NP | Verb Adjective',
                    ),
                    Lexicon(
                        Article='the | a | an',
                        Noun='robot | sheep | fence',
                        Adjective='good | new | sad',
                        Verb='is | say | are'
                    ))

E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky',  # A Probabilistic Grammar in CNF
                             ProbRules(
                                 S='NP VP [1]',
                                 NP='Article Noun [0.6] | Adjective Noun [0.4]',
                                 VP='Verb NP [0.5] | Verb Adjective [0.5]',
                             ),
                             ProbLexicon(
                                 Article='the [0.5] | a [0.25] | an [0.25]',
                                 Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
                                 Adjective='good [0.5] | new [0.2] | sad [0.3]',
                                 Verb='is [0.5] | say [0.3] | are [0.2]'
                             ))

E_Prob_Chomsky_ = ProbGrammar('E_Prob_Chomsky_',
                              ProbRules(
                                  S='NP VP [1]',
                                  NP='NP PP [0.4] | Noun Verb [0.6]',
                                  PP='Preposition NP [1]',
                                  VP='Verb NP [0.7] | VP PP [0.3]',
                              ),
                              ProbLexicon(
                                  Noun='astronomers [0.18] | eyes [0.32] | stars [0.32] | telescopes [0.18]',
                                  Verb='saw [0.5] | \'\' [0.5]',
                                  Preposition='with [1]'
                              ))
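
# Illustrative note (not part of the original file): cnf_rules() flattens the binary
# rules for CYK parsing; for E_Prob_Chomsky it yields tuples such as
# ('S', 'NP', 'VP', 1.0) and ('NP', 'Article', 'Noun', 0.6).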


# ______________________________________________________________________________
# 22.3 Parsing


class Chart:
    """Class for parsing sentences using a chart data structure.
    >>> chart = Chart(E0)
    >>> len(chart.parses('the stench is in 2 2'))
    1
    """

    def __init__(self, grammar, trace=False):
        """A data structure for parsing a string, and methods to do the parse.
        self.chart[i] holds the edges that end just before the i'th word.
        Edges are 5-element lists of [start, end, lhs, [found], [expects]]."""
        self.grammar = grammar
        self.trace = trace

    def parses(self, words, S='S'):
        """Return a list of parses; words can be a list or string."""
        if isinstance(words, str):
            words = words.split()
        self.parse(words, S)
        # Return all the parses that span the whole input
        # 'span the whole input' => begin at 0, end at len(words)
        return [[i, j, S, found, []]
                for (i, j, lhs, found, expects) in self.chart[len(words)]
                # assert j == len(words)
                if i == 0 and lhs == S and expects == []]

    def parse(self, words, S='S'):
        """Parse a list of words according to the grammar.
        Leave results in the chart."""
        self.chart = [[] for i in range(len(words) + 1)]
        self.add_edge([0, 0, 'S_', [], [S]])
        for i in range(len(words)):
            self.scanner(i, words[i])
        return self.chart

    def add_edge(self, edge):
        """Add edge to chart, and see if it extends or predicts another edge."""
        start, end, lhs, found, expects = edge
        if edge not in self.chart[end]:
            self.chart[end].append(edge)
            if self.trace:
                print('Chart: added {}'.format(edge))
            if not expects:
                self.extender(edge)
            else:
                self.predictor(edge)

    def scanner(self, j, word):
        """For each edge expecting a word of this category here, extend the edge."""
        for (i, j, A, alpha, Bb) in self.chart[j]:
            if Bb and self.grammar.isa(word, Bb[0]):
                self.add_edge([i, j + 1, A, alpha + [(Bb[0], word)], Bb[1:]])

    def predictor(self, edge):
        """Add to chart any rules for B that could help extend this edge."""
        (i, j, A, alpha, Bb) = edge
        B = Bb[0]
        if B in self.grammar.rules:
            for rhs in self.grammar.rewrites_for(B):
                self.add_edge([j, j, B, [], rhs])

    def extender(self, edge):
        """See what edges can be extended by this edge."""
        (j, k, B, _, _) = edge
        for (i, j, A, alpha, B1b) in self.chart[j]:
            if B1b and B == B1b[0]:
                self.add_edge([i, k, A, alpha + [edge], B1b[1:]])
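
# Illustrative usage (not part of the original file): build a chart for a grammar and
# ask for the complete parses of a sentence; pass trace=True to watch edges being added.
#
#     chart = Chart(E0, trace=False)
#     chart.parses('the wumpus is dead')   # complete edges spanning the whole input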


# ______________________________________________________________________________
# CYK Parsing


class Tree:
    def __init__(self, root, *args):
        self.root = root
        self.leaves = [leaf for leaf in args]


def CYK_parse(words, grammar):
    """ [Figure 22.6] """
    # We use 0-based indexing instead of the book's 1-based.
    P = defaultdict(float)
    T = defaultdict(Tree)

    # Insert lexical categories for each word.
    for (i, word) in enumerate(words):
        for (X, p) in grammar.categories[word]:
            P[X, i, i] = p
            T[X, i, i] = Tree(X, word)

    # Construct X(i:k) from Y(i:j) and Z(j+1:k), shortest span first
    for i, j, k in subspan(len(words)):
        for (X, Y, Z, p) in grammar.cnf_rules():
            PYZ = P[Y, i, j] * P[Z, j + 1, k] * p
            if PYZ > P[X, i, k]:
                P[X, i, k] = PYZ
                T[X, i, k] = Tree(X, T[Y, i, j], T[Z, j + 1, k])

    return T


def subspan(N):
    """Yield all tuples (i, j, k) covering a span (i, k) with i <= j < k."""
    # 0-based indexing, to match the lexical entries inserted by CYK_parse.
    for length in range(2, N + 1):
        for i in range(N + 1 - length):
            k = i + length - 1
            for j in range(i, k):
                yield (i, j, k)
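
# Illustrative usage (not part of the original file): CYK_parse expects a probabilistic
# grammar in Chomsky Normal Form and returns the table T of most probable subtrees.
# With the 0-based indexing used above, the best parse of the whole sentence (if one
# exists) is T['S', 0, len(words) - 1].
#
#     words = ['the', 'robot', 'is', 'good']
#     T = CYK_parse(words, E_Prob_Chomsky)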


# Bottom-up parsing posed as a search problem, solved with the search algorithms below.


class TextParsingProblem(Problem):
    def __init__(self, initial, grammar, goal='S'):
        """
        :param initial: the initial state: a list of words
        :param grammar: a grammar object
        :param goal: the goal state, usually S
        """
        super(TextParsingProblem, self).__init__(initial, goal)
        self.grammar = grammar
        self.combinations = defaultdict(list)  # maps a rhs (as a string of symbols) to the lhs symbols producing it
        # backward lookup of rules
        for rule in grammar.rules:
            for comb in grammar.rules[rule]:
                self.combinations[' '.join(comb)].append(rule)

    def actions(self, state):
        actions = []
        categories = self.grammar.categories
        # first replace each word by one of its lexical categories
        for i in range(len(state)):
            word = state[i]
            if word in categories:
                for X in categories[word]:
                    state[i] = X
                    actions.append(copy.copy(state))
                    state[i] = word
        # once all words are replaced by categories, reduce sequences of categories by applying rules backward
        if not actions:
            for start in range(len(state)):
                for end in range(start, len(state) + 1):
                    # try combinations between (start, end)
                    symbols = ' '.join(state[start:end])
                    for c in self.combinations[symbols]:
                        actions.append(state[:start] + [c] + state[end:])
        return actions

    def result(self, state, action):
        return action

    def h(self, state):
        # heuristic: the number of symbols still to be reduced
        return len(state)


def astar_search_parsing(words, grammar):
    """Bottom-up parsing using A* search to decide whether a list of words is a sentence."""
    # initialize the problem
    problem = TextParsingProblem(words, grammar, 'S')
    state = problem.initial
    # initialize the search frontier
    frontier = [(len(state) + problem.h(state), state)]
    heapq.heapify(frontier)

    while frontier:
        # expand the frontier node with the lowest cost first
        cost, state = heapq.heappop(frontier)
        actions = problem.actions(state)
        for action in actions:
            new_state = problem.result(state, action)
            # add each new node to the frontier
            if new_state == [problem.goal]:
                return problem.goal
            if new_state != state:
                heapq.heappush(frontier, (len(new_state) + problem.h(new_state), new_state))
    return False
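
# Illustrative usage (not part of the original file): the parser returns the goal
# symbol 'S' when the words can be reduced to a sentence of the grammar, else False.
#
#     astar_search_parsing(['the', 'wumpus', 'is', 'dead'], E0)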


def beam_search_parsing(words, grammar, b=3):
    """Bottom-up text parsing using beam search."""
    # initialize the problem
    problem = TextParsingProblem(words, grammar, 'S')
    # initialize the frontier
    frontier = [(len(problem.initial), problem.initial)]
    heapq.heapify(frontier)

    # explore the current frontier and keep the b new states with the lowest cost
    def explore(frontier):
        new_frontier = []
        for cost, state in frontier:
            # expand the possible child states of the current state
            if not problem.goal_test(' '.join(state)):
                actions = problem.actions(state)
                for action in actions:
                    new_state = problem.result(state, action)
                    if [len(new_state), new_state] not in new_frontier and new_state != state:
                        new_frontier.append([len(new_state), new_state])
            else:
                return problem.goal
        heapq.heapify(new_frontier)
        # only keep the b best states
        return heapq.nsmallest(b, new_frontier)

    while frontier:
        frontier = explore(frontier)
        if frontier == problem.goal:
            return frontier
    return False
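
# Illustrative usage (not part of the original file): same interface as
# astar_search_parsing, but only the b lowest-cost states survive each round, so a
# valid sentence can be missed when the beam width b is too small.
#
#     beam_search_parsing(['the', 'wumpus', 'is', 'dead'], E0, b=3)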


# ______________________________________________________________________________
# 22.4 Augmented Grammar

# The two grammars below illustrate augmented grammars, whose symbols carry extra
# arguments encoded in the symbol names. Their rules are written as plain
# dictionaries rather than built through Rules()/Lexicon().

g = Grammar("arithmetic_expression",  # A grammar of arithmetic expressions
            rules={
                'Number_0': 'Digit_0', 'Number_1': 'Digit_1', 'Number_2': 'Digit_2',
                'Number_10': 'Number_1 Digit_0', 'Number_11': 'Number_1 Digit_1',
                'Number_100': 'Number_10 Digit_0',
                'Exp_5': ['Number_5', '( Exp_5 )', 'Exp_1, Operator_+ Exp_4', 'Exp_2, Operator_+ Exp_3',
                          'Exp_0, Operator_+ Exp_5', 'Exp_3, Operator_+ Exp_2', 'Exp_4, Operator_+ Exp_1',
                          'Exp_5, Operator_+ Exp_0', 'Exp_1, Operator_* Exp_5'],  # more possible combinations
                'Operator_+': operator.add, 'Operator_-': operator.sub, 'Operator_*': operator.mul,
                'Operator_/': operator.truediv,
                'Digit_0': 0, 'Digit_1': 1, 'Digit_2': 2, 'Digit_3': 3, 'Digit_4': 4
            },
            lexicon={})

g = Grammar("Ali loves Bob",  # An example grammar for the "Ali loves Bob" sentence
            rules={
                "S_loves_ali_bob": "NP_ali, VP_x_loves_x_bob", "S_loves_bob_ali": "NP_bob, VP_x_loves_x_ali",
                "VP_x_loves_x_bob": "Verb_xy_loves_xy NP_bob", "VP_x_loves_x_ali": "Verb_xy_loves_xy NP_ali",
                "NP_bob": "Name_bob", "NP_ali": "Name_ali"
            },
            lexicon={
                "Name_ali": "Ali", "Name_bob": "Bob", "Verb_xy_loves_xy": "loves"
            })