GitHub Repository: aimacode/aima-python
Path: blob/master/nlp.py
"""Natural Language Processing; Chart Parsing and PageRanking (Chapter 22-23)"""

from collections import defaultdict
from utils import weighted_choice
import urllib.request
import re


# ______________________________________________________________________________
# Grammars and Lexicons


def Rules(**rules):
    """Create a dictionary mapping symbols to alternative sequences.
    >>> Rules(A = "B C | D E")
    {'A': [['B', 'C'], ['D', 'E']]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [alt.strip().split() for alt in rhs.split('|')]
    return rules


def Lexicon(**rules):
    """Create a dictionary mapping symbols to alternative words.
    >>> Lexicon(Article = "the | a | an")
    {'Article': ['the', 'a', 'an']}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [word.strip() for word in rhs.split('|')]
    return rules


class Grammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word in lexicon[lhs]:
                self.categories[word].append(lhs)

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat"""
        return cat in self.categories[word]

    def cnf_rules(self):
        """Returns the tuple (X, Y, Z) for rules in the form:
        X -> Y Z"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z) in rules:
                cnf.append((X, Y, Z))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively)."""
        import random

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    rewrite(random.choice(self.rules[token]), into)
                elif token in self.lexicon:
                    into.append(random.choice(self.lexicon[token]))
                else:
                    into.append(token)
            return into

        return ' '.join(rewrite(S.split(), []))

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

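# Illustrative sketch: how Rules, Lexicon and Grammar fit together. The toy grammar
# below (and the helper name _demo_grammar) is invented purely for demonstration;
# it is not one of the grammars defined later in this module.

def _demo_grammar():
    """Build a toy Grammar and exercise its basic queries."""
    toy = Grammar('Toy',
                  Rules(S='NP VP',
                        NP='Article Noun',
                        VP='Verb'),
                  Lexicon(Article='the | a',
                          Noun='dog | cat',
                          Verb='runs | sleeps'))
    assert toy.isa('dog', 'Noun')
    assert toy.rewrites_for('NP') == [['Article', 'Noun']]
    # generate_random expands symbols recursively; the output varies between calls,
    # e.g. 'the cat sleeps'.
    return toy.generate_random('S')
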
def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    with probabilities.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [alt.strip().split() for alt in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            rhs_rule = (r[:-1], prob)
            rules[lhs].append(rhs_rule)

    return rules


def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    with probabilities.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [word.strip().split() for word in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            word = r[:-1][0]
            rhs_rule = (word, prob)
            rules[lhs].append(rhs_rule)

    return rules


class ProbGrammar:

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)

        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat"""
        return cat in [c for c, _ in self.categories[word]]

    def cnf_rules(self):
        """Returns the tuple (X, Y, Z, p) for rules in the form:
        X -> Y Z [p]"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z), p in rules:
                cnf.append((X, Y, Z, p))

        return cnf

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""
        import random

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)

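# Illustrative sketch: ProbGrammar mirrors Grammar, but every rule and lexicon entry
# carries a probability, and generate_random also returns the probability of the
# sentence it produced. The toy grammar and the helper name _demo_prob_grammar are
# invented for demonstration only.

def _demo_prob_grammar():
    """Build a toy ProbGrammar and sample a sentence with its probability."""
    toy = ProbGrammar('ToyProb',
                      ProbRules(S='NP VP [1]',
                                NP='Article Noun [1]',
                                VP='Verb [1]'),
                      ProbLexicon(Article='the [0.6] | a [0.4]',
                                  Noun='dog [0.5] | cat [0.5]',
                                  Verb='runs [0.7] | sleeps [0.3]'))
    sentence, prob = toy.generate_random('S')
    # With all rule probabilities equal to 1, prob is the product of the word
    # probabilities, e.g. ('the dog runs', 0.6 * 0.5 * 0.7 = 0.21).
    return sentence, prob
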
E0 = Grammar('E0',
             Rules(  # Grammar for E_0 [Figure 22.4]
                 S='NP VP | S Conjunction S',
                 NP='Pronoun | Name | Noun | Article Noun | Digit Digit | NP PP | NP RelClause',
                 VP='Verb | VP NP | VP Adjective | VP PP | VP Adverb',
                 PP='Preposition NP',
                 RelClause='That VP'),

             Lexicon(  # Lexicon for E_0 [Figure 22.3]
                 Noun="stench | breeze | glitter | nothing | wumpus | pit | pits | gold | east",
                 Verb="is | see | smell | shoot | fell | stinks | go | grab | carry | kill | turn | feel",  # noqa
                 Adjective="right | left | east | south | back | smelly",
                 Adverb="here | there | nearby | ahead | right | left | east | south | back",
                 Pronoun="me | you | I | it",
                 Name="John | Mary | Boston | Aristotle",
                 Article="the | a | an",
                 Preposition="to | in | on | near",
                 Conjunction="and | or | but",
                 Digit="0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9",
                 That="that"
             ))

E_ = Grammar('E_',  # Trivial Grammar and lexicon for testing
             Rules(
                 S='NP VP',
                 NP='Art N | Pronoun',
                 VP='V NP'),

             Lexicon(
                 Art='the | a',
                 N='man | woman | table | shoelace | saw',
                 Pronoun='I | you | it',
                 V='saw | liked | feel'
             ))

E_NP_ = Grammar('E_NP_',  # Another Trivial Grammar for testing
                Rules(NP='Adj NP | N'),
                Lexicon(Adj='happy | handsome | hairy',
                        N='man'))

E_Prob = ProbGrammar('E_Prob',  # The Probabilistic Grammar from the notebook
                     ProbRules(
                         S="NP VP [0.6] | S Conjunction S [0.4]",
                         NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
                            | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
                         VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
                         Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
                         PP="Preposition NP [1]",
                         RelClause="RelPro VP [1]"
                     ),
                     ProbLexicon(
                         Verb="is [0.5] | say [0.3] | are [0.2]",
                         Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
                         Adjective="good [0.5] | new [0.2] | sad [0.3]",
                         Adverb="here [0.6] | lightly [0.1] | now [0.3]",
                         Pronoun="me [0.3] | you [0.4] | he [0.3]",
                         RelPro="that [0.5] | who [0.3] | which [0.2]",
                         Name="john [0.4] | mary [0.4] | peter [0.2]",
                         Article="the [0.5] | a [0.25] | an [0.25]",
                         Preposition="to [0.4] | in [0.3] | at [0.3]",
                         Conjunction="and [0.5] | or [0.2] | but [0.3]",
                         Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
                     ))

E_Chomsky = Grammar('E_Prob_Chomsky',  # A Grammar in Chomsky Normal Form
                    Rules(
                        S='NP VP',
                        NP='Article Noun | Adjective Noun',
                        VP='Verb NP | Verb Adjective',
                    ),
                    Lexicon(
                        Article='the | a | an',
                        Noun='robot | sheep | fence',
                        Adjective='good | new | sad',
                        Verb='is | say | are'
                    ))

E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky',  # A Probabilistic Grammar in CNF
                             ProbRules(
                                 S='NP VP [1]',
                                 NP='Article Noun [0.6] | Adjective Noun [0.4]',
                                 VP='Verb NP [0.5] | Verb Adjective [0.5]',
                             ),
                             ProbLexicon(
                                 Article='the [0.5] | a [0.25] | an [0.25]',
                                 Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
                                 Adjective='good [0.5] | new [0.2] | sad [0.3]',
                                 Verb='is [0.5] | say [0.3] | are [0.2]'
                             ))
E_Prob_Chomsky_ = ProbGrammar('E_Prob_Chomsky_',
                              ProbRules(
                                  S='NP VP [1]',
                                  NP='NP PP [0.4] | Noun Verb [0.6]',
                                  PP='Preposition NP [1]',
                                  VP='Verb NP [0.7] | VP PP [0.3]',
                              ),
                              ProbLexicon(
                                  Noun='astronomers [0.18] | eyes [0.32] | stars [0.32] | telescopes [0.18]',
                                  Verb='saw [0.5] | \'\' [0.5]',
                                  Preposition='with [1]'
                              ))

# ______________________________________________________________________________
# Chart Parsing


class Chart:
    """Class for parsing sentences using a chart data structure.
    >>> chart = Chart(E0)
    >>> len(chart.parses('the stench is in 2 2'))
    1
    """

    def __init__(self, grammar, trace=False):
        """A data structure for parsing a string, and methods to do the parse.
        self.chart[i] holds the edges that end just before the i'th word.
        Edges are 5-element lists of [start, end, lhs, [found], [expects]]."""
        self.grammar = grammar
        self.trace = trace

    def parses(self, words, S='S'):
        """Return a list of parses; words can be a list or string."""
        if isinstance(words, str):
            words = words.split()
        self.parse(words, S)
        # Return all the parses that span the whole input
        # 'span the whole input' => begin at 0, end at len(words)
        return [[i, j, S, found, []]
                for (i, j, lhs, found, expects) in self.chart[len(words)]
                # assert j == len(words)
                if i == 0 and lhs == S and expects == []]

    def parse(self, words, S='S'):
        """Parse a list of words according to the grammar.
        Leave results in the chart."""
        self.chart = [[] for i in range(len(words) + 1)]
        self.add_edge([0, 0, 'S_', [], [S]])
        for i in range(len(words)):
            self.scanner(i, words[i])
        return self.chart

    def add_edge(self, edge):
        """Add edge to chart, and see if it extends or predicts another edge."""
        start, end, lhs, found, expects = edge
        if edge not in self.chart[end]:
            self.chart[end].append(edge)
            if self.trace:
                print('Chart: added {}'.format(edge))
            if not expects:
                self.extender(edge)
            else:
                self.predictor(edge)

    def scanner(self, j, word):
        """For each edge expecting a word of this category here, extend the edge."""
        for (i, j, A, alpha, Bb) in self.chart[j]:
            if Bb and self.grammar.isa(word, Bb[0]):
                self.add_edge([i, j + 1, A, alpha + [(Bb[0], word)], Bb[1:]])

    def predictor(self, edge):
        """Add to chart any rules for B that could help extend this edge."""
        (i, j, A, alpha, Bb) = edge
        B = Bb[0]
        if B in self.grammar.rules:
            for rhs in self.grammar.rewrites_for(B):
                self.add_edge([j, j, B, [], rhs])

    def extender(self, edge):
        """See what edges can be extended by this edge."""
        (j, k, B, _, _) = edge
        for (i, j, A, alpha, B1b) in self.chart[j]:
            if B1b and B == B1b[0]:
                self.add_edge([i, k, A, alpha + [edge], B1b[1:]])

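# Illustrative sketch: parsing with the Chart class and the E0 grammar defined above,
# using the sentence from the Chart docstring. The helper name _demo_chart_parse is
# illustrative only.

def _demo_chart_parse():
    """Parse a wumpus-world sentence with E0 and count the complete parses."""
    chart = Chart(E0, trace=False)
    parses = chart.parses('the stench is in 2 2')
    # Each parse is a complete edge [0, 6, 'S', found, []] spanning all six words.
    return len(parses)
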
# ______________________________________________________________________________
# CYK Parsing

def CYK_parse(words, grammar):
    """ [Figure 23.5] """
    # We use 0-based indexing instead of the book's 1-based.
    N = len(words)
    P = defaultdict(float)

    # Insert lexical rules for each word.
    for (i, word) in enumerate(words):
        for (X, p) in grammar.categories[word]:
            P[X, i, 1] = p

    # Combine first and second parts of right-hand sides of rules,
    # from short to long.
    for length in range(2, N + 1):
        for start in range(N - length + 1):
            for len1 in range(1, length):  # N.B. the book incorrectly has N instead of length
                len2 = length - len1
                for (X, Y, Z, p) in grammar.cnf_rules():
                    P[X, start, length] = max(P[X, start, length],
                                              P[Y, start, len1] * P[Z, start + len1, len2] * p)

    return P

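# Illustrative sketch: CYK_parse with the CNF probabilistic grammar E_Prob_Chomsky
# defined above. P maps (symbol, start, length) to the probability of the most likely
# subtree with that root over that span. The helper name _demo_cyk is illustrative only.

def _demo_cyk():
    """Compute the probability of the most likely parse of a short sentence."""
    words = 'the robot is good'.split()
    P = CYK_parse(words, E_Prob_Chomsky)
    # For this grammar the best parse is S -> NP VP with NP -> Article Noun and
    # VP -> Verb Adjective: (0.6 * 0.5 * 0.4) * (0.5 * 0.5 * 0.5) * 1 = 0.015.
    return P['S', 0, len(words)]
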
# ______________________________________________________________________________
# Page Ranking

# The first entry in the list is the base URL; the following entries are relative page names
examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism",
                   "Social_philosophy", "Socrates", "Subjectivity", "Theology",
                   "Truth", "Western_philosophy"]


def loadPageHTML(addressList):
    """Download HTML page content for every URL address passed as argument"""
    contentDict = {}
    for addr in addressList:
        with urllib.request.urlopen(addr) as response:
            raw_html = response.read().decode('utf-8')
            # Strip raw html of unnecessary content. Basically everything that isn't link or text
            html = stripRawHTML(raw_html)
        contentDict[addr] = html
    return contentDict


def initPages(addressList):
    """Create a dictionary of pages from a list of URL addresses"""
    pages = {}
    for addr in addressList:
        pages[addr] = Page(addr)
    return pages


def stripRawHTML(raw_html):
    """Remove the <head> section of the HTML which contains links to stylesheets etc.,
    and remove all other unnecessary HTML"""
    # TODO: Strip more out of the raw html
    return re.sub("<head>.*?</head>", "", raw_html, flags=re.DOTALL)  # remove <head> section


def determineInlinks(page):
    """Given a set of pages that have their outlinks determined, we can fill
    out a page's inlinks by looking through all other pages' outlinks"""
    inlinks = []
    for addr, indexPage in pagesIndex.items():
        if page.address == indexPage.address:
            continue
        elif page.address in indexPage.outlinks:
            inlinks.append(addr)
    return inlinks


def findOutlinks(page, handleURLs=None):
    """Search a page's HTML content for URL links to other pages"""
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
    if handleURLs:
        urls = handleURLs(urls)
    return urls


def onlyWikipediaURLS(urls):
    """Some example HTML page data is from Wikipedia. This function converts
    relative Wikipedia links to full Wikipedia URLs"""
    wikiURLs = [url for url in urls if url.startswith('/wiki/')]
    return ["https://en.wikipedia.org" + url for url in wikiURLs]

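# Illustrative sketch: how examplePagesSet and the helpers above fit together.
# Fetching pages is a live network operation, so it is wrapped in a function rather
# than run at import time; the helper name _load_example_pages and its local
# variable names are illustrative only.

def _load_example_pages():
    """Fetch the example Wikipedia pages and fill the global content/index maps."""
    global pagesContent, pagesIndex
    base, names = examplePagesSet[0], examplePagesSet[1:]
    fullURLs = [base + name for name in names]
    pagesContent = loadPageHTML(fullURLs)  # address -> stripped HTML
    pagesIndex = initPages(fullURLs)       # address -> Page object
    # Outlinks can now be extracted per page, keeping only Wikipedia links;
    # inlinks then follow from every other page's outlinks.
    for page in pagesIndex.values():
        page.outlinks = findOutlinks(page, handleURLs=onlyWikipediaURLS)
    for page in pagesIndex.values():
        page.inlinks = determineInlinks(page)
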
# ______________________________________________________________________________
# HITS Helper Functions

def expand_pages(pages):
    """Adds in every page that links to or is linked from one of
    the relevant pages."""
    expanded = {}
    for addr, page in pages.items():
        if addr not in expanded:
            expanded[addr] = page
        for inlink in page.inlinks:
            if inlink not in expanded:
                expanded[inlink] = pagesIndex[inlink]
        for outlink in page.outlinks:
            if outlink not in expanded:
                expanded[outlink] = pagesIndex[outlink]
    return expanded


def relevant_pages(query):
    """Relevant pages are pages that contain all of the query words. They are obtained by
    intersecting the hit lists of the query words."""
    hit_intersection = {addr for addr in pagesIndex}
    query_words = query.split()
    for query_word in query_words:
        hit_list = set()
        for addr in pagesIndex:
            if query_word.lower() in pagesContent[addr].lower():
                hit_list.add(addr)
        hit_intersection = hit_intersection.intersection(hit_list)
    return {addr: pagesIndex[addr] for addr in hit_intersection}


def normalize(pages):
    """Divide each page's score by the square root of the sum of the squares of all
    pages' scores (separately for the hub and authority scores), so that each score
    vector has unit Euclidean length.
    """
    summed_hub = sum(page.hub ** 2 for _, page in pages.items())
    summed_auth = sum(page.authority ** 2 for _, page in pages.items())
    for _, page in pages.items():
        page.hub /= summed_hub ** 0.5
        page.authority /= summed_auth ** 0.5

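# Illustrative sketch of the normalization above: after normalize(), the hub scores
# (and, separately, the authority scores) form a unit-length vector. The two pages
# below are made up for this example; Page is defined in the HITS Algorithm section
# further down, so it is only looked up when this function is called.

def _demo_normalize():
    """Normalize two made-up pages and check the resulting scores."""
    pages = {'a': Page('a', hub=3, authority=1),
             'b': Page('b', hub=4, authority=1)}
    normalize(pages)
    # Hubs become 3/5 and 4/5 (since sqrt(3**2 + 4**2) == 5);
    # authorities become 1/sqrt(2) each.
    assert abs(pages['a'].hub - 0.6) < 1e-9 and abs(pages['b'].hub - 0.8) < 1e-9
    return pages
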
class ConvergenceDetector(object):
    """If the hub and authority values of the pages are no longer changing, we have
    reached convergence and further iterations will have no effect. This detects convergence
    so that we can stop the HITS algorithm as early as possible."""

    def __init__(self):
        self.hub_history = None
        self.auth_history = None

    def __call__(self):
        return self.detect()

    def detect(self):
        curr_hubs = [page.hub for addr, page in pagesIndex.items()]
        curr_auths = [page.authority for addr, page in pagesIndex.items()]
        if self.hub_history is None:
            self.hub_history, self.auth_history = [], []
        else:
            diffsHub = [abs(x - y) for x, y in zip(curr_hubs, self.hub_history[-1])]
            diffsAuth = [abs(x - y) for x, y in zip(curr_auths, self.auth_history[-1])]
            aveDeltaHub = sum(diffsHub) / float(len(pagesIndex))
            aveDeltaAuth = sum(diffsAuth) / float(len(pagesIndex))
            if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01:  # may need tweaking
                return True
        if len(self.hub_history) > 2:  # prevent list from getting long
            del self.hub_history[0]
            del self.auth_history[0]
        self.hub_history.append([x for x in curr_hubs])
        self.auth_history.append([x for x in curr_auths])
        return False


def getInLinks(page):
    if not page.inlinks:
        page.inlinks = determineInlinks(page)
    return [addr for addr, p in pagesIndex.items() if addr in page.inlinks]


def getOutLinks(page):
    if not page.outlinks:
        page.outlinks = findOutlinks(page)
    return [addr for addr, p in pagesIndex.items() if addr in page.outlinks]


# ______________________________________________________________________________
# HITS Algorithm

class Page(object):
    def __init__(self, address, inLinks=None, outLinks=None, hub=0, authority=0):
        self.address = address
        self.hub = hub
        self.authority = authority
        self.inlinks = inLinks
        self.outlinks = outLinks


pagesContent = {}  # maps a Page's relative or absolute URL/location to the page's HTML content
pagesIndex = {}
convergence = ConvergenceDetector()  # callable instance assigned to a variable to mimic the pseudocode's syntax


def HITS(query):
    """The HITS algorithm for computing hubs and authorities with respect to a query."""
    pages = expand_pages(relevant_pages(query))
    for p in pages.values():
        p.authority = 1
        p.hub = 1
    while not convergence():
        authority = {p: pages[p].authority for p in pages}
        hub = {p: pages[p].hub for p in pages}
        for p in pages:
            # p.authority ← ∑i Inlinki(p).Hub
            pages[p].authority = sum(hub[x] for x in getInLinks(pages[p]))
            # p.hub ← ∑i Outlinki(p).Authority
            pages[p].hub = sum(authority[x] for x in getOutLinks(pages[p]))
        normalize(pages)
    return pages

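# Illustrative sketch: an end-to-end HITS run on a small, self-contained link graph
# instead of live Wikipedia pages. The addresses, contents, and link structure below
# are made up for demonstration and simply mirror how pagesContent, pagesIndex and
# convergence are used by relevant_pages, expand_pages and the HITS loop above.

def _demo_hits():
    """Run HITS over a tiny hand-built corpus and return pages sorted by authority."""
    global pagesContent, pagesIndex, convergence
    pagesContent = {'a': 'python parsing tutorial',
                    'b': 'python reference',
                    'c': 'unrelated page'}
    pagesIndex = {addr: Page(addr) for addr in pagesContent}
    # Hand-built link structure: a <-> b, and c -> a.
    pagesIndex['a'].outlinks, pagesIndex['a'].inlinks = ['b'], ['b', 'c']
    pagesIndex['b'].outlinks, pagesIndex['b'].inlinks = ['a'], ['a']
    pagesIndex['c'].outlinks, pagesIndex['c'].inlinks = ['a'], []
    convergence = ConvergenceDetector()  # fresh detector so no stale history is reused
    ranked = HITS('python')
    return sorted(ranked.values(), key=lambda page: page.authority, reverse=True)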