GitHub Repository: aimacode/aima-python
Path: blob/master/nlp4e.py
"""Natural Language Processing (Chapter 22)"""

from collections import defaultdict
from utils4e import weighted_choice
import copy
import operator
import heapq
from search import Problem


# ______________________________________________________________________________
# 22.2 Grammars

def Rules(**rules):
    """Create a dictionary mapping symbols to alternative sequences.
    >>> Rules(A = "B C | D E")
    {'A': [['B', 'C'], ['D', 'E']]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [alt.strip().split() for alt in rhs.split('|')]
    return rules


def Lexicon(**rules):
    """Create a dictionary mapping symbols to alternative words.
    >>> Lexicon(Article = "the | a | an")
    {'Article': ['the', 'a', 'an']}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [word.strip() for word in rhs.split('|')]
    return rules

class Grammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word in lexicon[lhs]:
                self.categories[word].append(lhs)

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in self.categories[word]

    def cnf_rules(self):
        """Return the tuples (X, Y, Z) for rules of the form X -> Y Z.
        (Assumes every rule has exactly two symbols on its right-hand side.)"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z) in rules:
                cnf.append((X, Y, Z))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively)."""
        import random

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    rewrite(random.choice(self.rules[token]), into)
                elif token in self.lexicon:
                    into.append(random.choice(self.lexicon[token]))
                else:
                    into.append(token)
            return into

        return ' '.join(rewrite(S.split(), []))

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    with probabilities.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [alt.strip().split() for alt in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            rhs_rule = (r[:-1], prob)
            rules[lhs].append(rhs_rule)

    return rules


def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    with probabilities.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [word.strip().split() for word in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            word = r[:-1][0]
            rhs_rule = (word, prob)
            rules[lhs].append(rhs_rule)

    return rules

class ProbGrammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)

        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in [c for c, _ in self.categories[word]]

    def cnf_rules(self):
        """Return the tuples (X, Y, Z, p) for rules of the form X -> Y Z [p]."""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z), p in rules:
                cnf.append((X, Y, Z, p))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

E0 = Grammar('E0',
             Rules(  # Grammar for E_0 [Figure 22.2]
                 S='NP VP | S Conjunction S',
                 NP='Pronoun | Name | Noun | Article Noun | Digit Digit | NP PP | NP RelClause',
                 VP='Verb | VP NP | VP Adjective | VP PP | VP Adverb',
                 PP='Preposition NP',
                 RelClause='That VP'),

             Lexicon(  # Lexicon for E_0 [Figure 22.3]
                 Noun="stench | breeze | glitter | nothing | wumpus | pit | pits | gold | east",
                 Verb="is | see | smell | shoot | fell | stinks | go | grab | carry | kill | turn | feel",  # noqa
                 Adjective="right | left | east | south | back | smelly | dead",
                 Adverb="here | there | nearby | ahead | right | left | east | south | back",
                 Pronoun="me | you | I | it",
                 Name="John | Mary | Boston | Aristotle",
                 Article="the | a | an",
                 Preposition="to | in | on | near",
                 Conjunction="and | or | but",
                 Digit="0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9",
                 That="that"
             ))

E_ = Grammar('E_',  # Trivial Grammar and lexicon for testing
             Rules(
                 S='NP VP',
                 NP='Art N | Pronoun',
                 VP='V NP'),

             Lexicon(
                 Art='the | a',
                 N='man | woman | table | shoelace | saw',
                 Pronoun='I | you | it',
                 V='saw | liked | feel'
             ))

E_NP_ = Grammar('E_NP_',  # Another Trivial Grammar for testing
                Rules(NP='Adj NP | N'),
                Lexicon(Adj='happy | handsome | hairy',
                        N='man'))
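
# Illustrative usage (not part of the original file): Grammar.generate_random expands
# nonterminals with uniformly random alternatives, so its output differs on every call.
#
#     E_.generate_random('S')       # e.g. 'the man saw it'
#     E_NP_.generate_random('NP')   # e.g. 'happy hairy man'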

E_Prob = ProbGrammar('E_Prob',  # The Probabilistic Grammar from the notebook
                     ProbRules(
                         S="NP VP [0.6] | S Conjunction S [0.4]",
                         NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
                         | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
                         VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
                         Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
                         PP="Preposition NP [1]",
                         RelClause="RelPro VP [1]"
                     ),
                     ProbLexicon(
                         Verb="is [0.5] | say [0.3] | are [0.2]",
                         Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
                         Adjective="good [0.5] | new [0.2] | sad [0.3]",
                         Adverb="here [0.6] | lightly [0.1] | now [0.3]",
                         Pronoun="me [0.3] | you [0.4] | he [0.3]",
                         RelPro="that [0.5] | who [0.3] | which [0.2]",
                         Name="john [0.4] | mary [0.4] | peter [0.2]",
                         Article="the [0.5] | a [0.25] | an [0.25]",
                         Preposition="to [0.4] | in [0.3] | at [0.3]",
                         Conjunction="and [0.5] | or [0.2] | but [0.3]",
                         Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
                     ))
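
# Illustrative usage (not part of the original file): ProbGrammar.generate_random
# returns a (sentence, probability) pair, where the probability is the product of the
# probabilities of every rule and lexical choice made during generation.
#
#     sentence, probability = E_Prob.generate_random('S')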

E_Chomsky = Grammar('E_Chomsky',  # A Grammar in Chomsky Normal Form
                    Rules(
                        S='NP VP',
                        NP='Article Noun | Adjective Noun',
                        VP='Verb NP | Verb Adjective',
                    ),
                    Lexicon(
                        Article='the | a | an',
                        Noun='robot | sheep | fence',
                        Adjective='good | new | sad',
                        Verb='is | say | are'
                    ))

E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky',  # A Probabilistic Grammar in CNF
                             ProbRules(
                                 S='NP VP [1]',
                                 NP='Article Noun [0.6] | Adjective Noun [0.4]',
                                 VP='Verb NP [0.5] | Verb Adjective [0.5]',
                             ),
                             ProbLexicon(
                                 Article='the [0.5] | a [0.25] | an [0.25]',
                                 Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
                                 Adjective='good [0.5] | new [0.2] | sad [0.3]',
                                 Verb='is [0.5] | say [0.3] | are [0.2]'
                             ))

E_Prob_Chomsky_ = ProbGrammar('E_Prob_Chomsky_',
                              ProbRules(
                                  S='NP VP [1]',
                                  NP='NP PP [0.4] | Noun Verb [0.6]',
                                  PP='Preposition NP [1]',
                                  VP='Verb NP [0.7] | VP PP [0.3]',
                              ),
                              ProbLexicon(
                                  Noun='astronomers [0.18] | eyes [0.32] | stars [0.32] | telescopes [0.18]',
                                  Verb='saw [0.5] | \'\' [0.5]',
                                  Preposition='with [1]'
                              ))
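
# Illustrative note (not part of the original file): cnf_rules() flattens the binary
# rules for CYK parsing; for E_Prob_Chomsky it yields tuples such as
# ('S', 'NP', 'VP', 1.0) and ('NP', 'Article', 'Noun', 0.6).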


# ______________________________________________________________________________
# 22.3 Parsing


class Chart:
    """Class for parsing sentences using a chart data structure.
    >>> chart = Chart(E0)
    >>> len(chart.parses('the stench is in 2 2'))
    1
    """

    def __init__(self, grammar, trace=False):
        """A data structure for parsing a string, and methods to do the parse.
        self.chart[i] holds the edges that end just before the i'th word.
        Edges are 5-element lists of [start, end, lhs, [found], [expects]]."""
        self.grammar = grammar
        self.trace = trace

    def parses(self, words, S='S'):
        """Return a list of parses; words can be a list or string."""
        if isinstance(words, str):
            words = words.split()
        self.parse(words, S)
        # Return all the parses that span the whole input
        # 'span the whole input' => begin at 0, end at len(words)
        return [[i, j, S, found, []]
                for (i, j, lhs, found, expects) in self.chart[len(words)]
                # assert j == len(words)
                if i == 0 and lhs == S and expects == []]

    def parse(self, words, S='S'):
        """Parse a list of words according to the grammar.
        Leave results in the chart."""
        self.chart = [[] for i in range(len(words) + 1)]
        self.add_edge([0, 0, 'S_', [], [S]])
        for i in range(len(words)):
            self.scanner(i, words[i])
        return self.chart

    def add_edge(self, edge):
        """Add edge to chart, and see if it extends or predicts another edge."""
        start, end, lhs, found, expects = edge
        if edge not in self.chart[end]:
            self.chart[end].append(edge)
            if self.trace:
                print('Chart: added {}'.format(edge))
            if not expects:
                self.extender(edge)
            else:
                self.predictor(edge)

    def scanner(self, j, word):
        """For each edge expecting a word of this category here, extend the edge."""
        for (i, j, A, alpha, Bb) in self.chart[j]:
            if Bb and self.grammar.isa(word, Bb[0]):
                self.add_edge([i, j + 1, A, alpha + [(Bb[0], word)], Bb[1:]])

    def predictor(self, edge):
        """Add to chart any rules for B that could help extend this edge."""
        (i, j, A, alpha, Bb) = edge
        B = Bb[0]
        if B in self.grammar.rules:
            for rhs in self.grammar.rewrites_for(B):
                self.add_edge([j, j, B, [], rhs])

    def extender(self, edge):
        """See what edges can be extended by this edge."""
        (j, k, B, _, _) = edge
        for (i, j, A, alpha, B1b) in self.chart[j]:
            if B1b and B == B1b[0]:
                self.add_edge([i, k, A, alpha + [edge], B1b[1:]])
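
# Illustrative usage (not part of the original file): build a chart for a grammar and
# ask for the complete parses of a sentence; pass trace=True to watch edges being added.
#
#     chart = Chart(E0, trace=False)
#     chart.parses('the wumpus is dead')   # complete edges spanning the whole input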


# ______________________________________________________________________________
# CYK Parsing


class Tree:
    def __init__(self, root, *args):
        self.root = root
        self.leaves = [leaf for leaf in args]


def CYK_parse(words, grammar):
    """ [Figure 22.6] """
    # We use 0-based indexing instead of the book's 1-based.
    P = defaultdict(float)
    T = defaultdict(Tree)

    # Insert lexical categories for each word.
    for (i, word) in enumerate(words):
        for (X, p) in grammar.categories[word]:
            P[X, i, i] = p
            T[X, i, i] = Tree(X, word)

    # Construct X(i:k) from Y(i:j) and Z(j+1:k), shortest span first
    for i, j, k in subspan(len(words)):
        for (X, Y, Z, p) in grammar.cnf_rules():
            PYZ = P[Y, i, j] * P[Z, j + 1, k] * p
            if PYZ > P[X, i, k]:
                P[X, i, k] = PYZ
                T[X, i, k] = Tree(X, T[Y, i, j], T[Z, j + 1, k])

    return T


def subspan(N):
    """Yield all tuples (i, j, k) covering a span (i, k) with i <= j < k."""
    # 0-based indexing, to match the lexical entries inserted by CYK_parse.
    for length in range(2, N + 1):
        for i in range(N + 1 - length):
            k = i + length - 1
            for j in range(i, k):
                yield (i, j, k)
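
# Illustrative usage (not part of the original file): CYK_parse expects a probabilistic
# grammar in Chomsky Normal Form and returns the table T of most probable subtrees.
# With the 0-based indexing used above, the best parse of the whole sentence (if one
# exists) is T['S', 0, len(words) - 1].
#
#     words = ['the', 'robot', 'is', 'good']
#     T = CYK_parse(words, E_Prob_Chomsky)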


# Bottom-up parsing posed as a search problem, solved with the search algorithms below.


class TextParsingProblem(Problem):
    def __init__(self, initial, grammar, goal='S'):
        """
        :param initial: the initial state: a list of words
        :param grammar: a grammar object
        :param goal: the goal state, usually S
        """
        super(TextParsingProblem, self).__init__(initial, goal)
        self.grammar = grammar
        self.combinations = defaultdict(list)  # maps a rhs (as a string of symbols) to the lhs symbols producing it
        # backward lookup of rules
        for rule in grammar.rules:
            for comb in grammar.rules[rule]:
                self.combinations[' '.join(comb)].append(rule)

    def actions(self, state):
        actions = []
        categories = self.grammar.categories
        # first replace each word by one of its lexical categories
        for i in range(len(state)):
            word = state[i]
            if word in categories:
                for X in categories[word]:
                    state[i] = X
                    actions.append(copy.copy(state))
                    state[i] = word
        # once all words are replaced by categories, reduce sequences of categories by applying rules backward
        if not actions:
            for start in range(len(state)):
                for end in range(start, len(state) + 1):
                    # try combinations between (start, end)
                    symbols = ' '.join(state[start:end])
                    for c in self.combinations[symbols]:
                        actions.append(state[:start] + [c] + state[end:])
        return actions

    def result(self, state, action):
        return action

    def h(self, state):
        # heuristic: the number of symbols still to be reduced
        return len(state)


def astar_search_parsing(words, grammar):
    """Bottom-up parsing using A* search to decide whether a list of words is a sentence."""
    # initialize the problem
    problem = TextParsingProblem(words, grammar, 'S')
    state = problem.initial
    # initialize the search frontier
    frontier = [(len(state) + problem.h(state), state)]
    heapq.heapify(frontier)

    while frontier:
        # expand the frontier node with the lowest cost first
        cost, state = heapq.heappop(frontier)
        actions = problem.actions(state)
        for action in actions:
            new_state = problem.result(state, action)
            # add each new node to the frontier
            if new_state == [problem.goal]:
                return problem.goal
            if new_state != state:
                heapq.heappush(frontier, (len(new_state) + problem.h(new_state), new_state))
    return False
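
# Illustrative usage (not part of the original file): the parser returns the goal
# symbol 'S' when the words can be reduced to a sentence of the grammar, else False.
#
#     astar_search_parsing(['the', 'wumpus', 'is', 'dead'], E0)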


def beam_search_parsing(words, grammar, b=3):
    """Bottom-up text parsing using beam search."""
    # initialize the problem
    problem = TextParsingProblem(words, grammar, 'S')
    # initialize the frontier
    frontier = [(len(problem.initial), problem.initial)]
    heapq.heapify(frontier)

    # explore the current frontier and keep the b new states with the lowest cost
    def explore(frontier):
        new_frontier = []
        for cost, state in frontier:
            # expand the possible child states of the current state
            if not problem.goal_test(' '.join(state)):
                actions = problem.actions(state)
                for action in actions:
                    new_state = problem.result(state, action)
                    if [len(new_state), new_state] not in new_frontier and new_state != state:
                        new_frontier.append([len(new_state), new_state])
            else:
                return problem.goal
        heapq.heapify(new_frontier)
        # only keep the b best states
        return heapq.nsmallest(b, new_frontier)

    while frontier:
        frontier = explore(frontier)
        if frontier == problem.goal:
            return frontier
    return False
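
# Illustrative usage (not part of the original file): same interface as
# astar_search_parsing, but only the b lowest-cost states survive each round, so a
# valid sentence can be missed when the beam width b is too small.
#
#     beam_search_parsing(['the', 'wumpus', 'is', 'dead'], E0, b=3)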


# ______________________________________________________________________________
# 22.4 Augmented Grammar

# The two grammars below illustrate augmented grammars, whose symbols carry extra
# arguments encoded in the symbol names. Their rules are written as plain
# dictionaries rather than built through Rules()/Lexicon().

g = Grammar("arithmetic_expression",  # A grammar of arithmetic expressions
            rules={
                'Number_0': 'Digit_0', 'Number_1': 'Digit_1', 'Number_2': 'Digit_2',
                'Number_10': 'Number_1 Digit_0', 'Number_11': 'Number_1 Digit_1',
                'Number_100': 'Number_10 Digit_0',
                'Exp_5': ['Number_5', '( Exp_5 )', 'Exp_1, Operator_+ Exp_4', 'Exp_2, Operator_+ Exp_3',
                          'Exp_0, Operator_+ Exp_5', 'Exp_3, Operator_+ Exp_2', 'Exp_4, Operator_+ Exp_1',
                          'Exp_5, Operator_+ Exp_0', 'Exp_1, Operator_* Exp_5'],  # more possible combinations
                'Operator_+': operator.add, 'Operator_-': operator.sub, 'Operator_*': operator.mul,
                'Operator_/': operator.truediv,
                'Digit_0': 0, 'Digit_1': 1, 'Digit_2': 2, 'Digit_3': 3, 'Digit_4': 4
            },
            lexicon={})

g = Grammar("Ali loves Bob",  # An example grammar for the "Ali loves Bob" sentence
            rules={
                "S_loves_ali_bob": "NP_ali, VP_x_loves_x_bob", "S_loves_bob_ali": "NP_bob, VP_x_loves_x_ali",
                "VP_x_loves_x_bob": "Verb_xy_loves_xy NP_bob", "VP_x_loves_x_ali": "Verb_xy_loves_xy NP_ali",
                "NP_bob": "Name_bob", "NP_ali": "Name_ali"
            },
            lexicon={
                "Name_ali": "Ali", "Name_bob": "Bob", "Verb_xy_loves_xy": "loves"
            })