CoCalc -- sgmllib.py

GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/lib/utils/sgmllib.py
²⁹⁸⁹ views
1
"""A parser for SGML, using the derived class as a static DTD."""
2

3
# Note: missing in Python3
4

5
# XXX This only supports those SGML features used by HTML.
6

7
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
12

13
from __future__ import print_function
14

15
# The base-class module is named ``_markupbase`` on Python 3 and
# ``markupbase`` on Python 2.  Catch only ImportError so that unrelated
# failures (KeyboardInterrupt, SystemExit, ...) are not silently swallowed.
try:
    import _markupbase as markupbase
except ImportError:
    import markupbase
19

20
import re
21

22
__all__ = ["SGMLParser", "SGMLParseError"]
23

24
# Regular expressions used for parsing
25

26
# First character that may start markup: an entity/char reference or a tag.
interesting = re.compile('[&<]')

# A prefix that *could* still grow into a reference, start tag, end tag or
# declaration once more input arrives.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Both reference patterns require one trailing terminator character, so a
# reference at the very end of the buffer is not matched by them.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

starttagopen = re.compile('<[>a-zA-Z]')
# SGML shorthand: <tag/data/ == <tag>data</tag>
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')

# group(1): attribute name, group(2): the whole "= value" part (None when
# the attribute has no value), group(3): the value, quotes still attached.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
44

45

46
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
49

50

51
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
61

62
class SGMLParser(markupbase.ParserBase):
    """SGML parser base class -- find tags and call handler functions.

    Usage: ``p = SGMLParser(); p.feed(data); ...; p.close()``.

    The DTD is defined by deriving a class which defines methods with
    special names to handle tags: ``start_foo`` and ``end_foo`` handle
    ``<foo>`` and ``</foo>`` respectively, and ``do_foo`` handles a
    ``<foo>`` that has no matching end tag.  Tags are converted to lower
    case before the lookup.  Text between tags is passed to
    ``handle_data()`` (possibly split into arbitrary chunks); entity and
    character references go to ``handle_entityref()`` and
    ``handle_charref()``.
    """

    # Matches an entity or character reference inside an attribute value.
    # group(1): entity name, group(2): decimal code, group(3): optional ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # input that has not been consumed yet
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # tag used for the "<>" shorthand
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling all data as if followed by an
        EOF marker: whatever could not be parsed is emitted as plain
        data.  The unconsumed tail is stored back into ``self.rawdata``.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            j = match.start() if match else n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i + 1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not a position.
                    i = i + k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i + 1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminator character; give
                    # it back unless it was the closing ';'.
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    def parse_pi(self, i):
        """Internal -- parse a processing instruction starting at *i*.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the number of characters consumed, or -1 if the
        instruction is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i + 2)
        if not match:
            return -1
        self.handle_pi(rawdata[i + 2:match.start(0)])
        return match.end(0) - i

    def get_starttag_text(self):
        """Return the raw text of the most recently parsed start tag."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at *i*.

        Returns the index just past the tag (an absolute position in
        ``self.rawdata``), or -1 if the tag is not terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i + 1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i + 2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i + 1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i + 1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Valueless attribute: the name doubles as the value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # A '<' terminator belongs to the next tag and is not consumed.
        if rawdata[j] == '>':
            j = j + 1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert an entity or character reference.

        *match* is a match of ``entity_or_charref``.  References that
        cannot be converted are returned as they were written; an entity
        reference without the trailing ';' is never converted.
        """
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal -- handle the end tag beginning at *i*.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i + 2:j].strip().lower()
        if rawdata[j] == '>':
            j = j + 1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- dispatch a start tag to its handler.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled
        by ``do_<tag>``) and 1 for a balanced tag (handled by
        ``start_<tag>`` and pushed onto ``self.stack``).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            pass
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1
        try:
            method = getattr(self, 'do_' + tag)
        except AttributeError:
            self.unknown_starttag(tag, attrs)
            return -1
        self.handle_starttag(tag, method, attrs)
        return 0

    def finish_endtag(self, tag):
        """Internal -- close *tag* and every tag opened after it.

        An empty *tag* closes the innermost open tag.  A tag that is not
        on the stack is reported via report_unbalanced() when an
        ``end_<tag>`` handler exists, otherwise via unknown_endtag().
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Index of the innermost (last) occurrence of the tag.
            found = len(self.stack) - 1 - self.stack[::-1].index(tag)
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling ``method(attrs)``."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling ``method()``."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag (only if verbose)."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not a decimal number in 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for a name that is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
480

481

482
class TestSGMLParser(SGMLParser):
    """Debugging parser that prints every event it receives.

    Character data is buffered in ``self.testdata`` and printed once its
    repr reaches 70 characters, or just before any other event is
    reported, so consecutive data chunks show up as one line.
    """

    def __init__(self, verbose=0):
        # The buffer must exist before the base constructor runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; print the buffer once it is long enough."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data:', repr(data))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment:', r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            print('start tag: <' + tag, end=' ')
            for name, value in attrs:
                print(name + '=' + '"' + value + '"', end=' ')
            print('>')

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
535

536

537
def test(args=None):
    """Parse a file and print the events, one input character at a time.

    *args* defaults to ``sys.argv[1:]``.  A leading ``-s`` selects the
    plain SGMLParser (which prints nothing) instead of TestSGMLParser.
    The next argument is the file to read; ``-`` means standard input
    and the default is ``test.html``.  Exits with status 1 when the file
    cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    path = args[0] if args else 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print(path, ":", msg)
            sys.exit(1)
        # Close the file even if reading it fails.
        with f:
            data = f.read()

    x = klass()
    # Feeding single characters exercises the incremental-parsing paths.
    for c in data:
        x.feed(c)
    x.close()
571

572

573
# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()
575

576
Product

Resources

Company