# Book a Demo!
# CoCalc Logo Icon
# Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
# sqlmapproject
# GitHub Repository: sqlmapproject/sqlmap
# Path: blob/master/lib/utils/sgmllib.py
# 2989 views
"""A parser for SGML, using the derived class as a static DTD."""

# Note: missing in Python3

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.

from __future__ import print_function
14
15
# The base-class module is called ``_markupbase`` on Python 3 and
# ``markupbase`` on Python 2; bind whichever exists to one name.
# Only ImportError is caught so unrelated failures are not hidden.
try:
    import _markupbase as markupbase
except ImportError:
    import markupbase
19
20
import re
21
22
__all__ = ["SGMLParser", "SGMLParseError"]
23
24
# Regular expressions used for parsing

# First character that may start markup or a reference.
interesting = re.compile('[&<]')
# Prefix of a reference, tag, end tag or declaration that may still be
# completed by data fed later.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Entity / character reference followed by one terminating character
# (group 1 is the entity name or the decimal character code).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Start of a start tag: '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag: <tag/data/
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Next bracket after a tag has been opened.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 = name, group 2 = the whole "= value" part
# (None when absent), group 3 = the value including any quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
44
45
46
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
49
50
51
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
61
62
class SGMLParser(markupbase.ParserBase):
    """SGML parser base class: finds tags and calls handler methods.

    Handlers are looked up by name on the instance: ``start_<tag>`` and
    ``end_<tag>`` for tags that are tracked on the open-tag stack,
    ``do_<tag>`` for tags that are handled without being pushed on the
    stack.  Anything without a handler is reported through the
    ``unknown_*`` hooks, and text is delivered through ``handle_data()``.
    """

    # Matches an entity or character reference inside an attribute value:
    # group 1 = entity name, group 2 = decimal character code,
    # group 3 = the optional terminating ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n'). (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling all data as if followed by an
        EOF marker.  Whatever could not be consumed stays in
        ``self.rawdata``.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i + 1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i + k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i + 1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The regex consumed one terminating character; give
                    # it back unless it was the ';' of the reference.
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at *i*.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the length of the instruction, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i + 2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i + 2: j])
        j = match.end(0)
        return j - i

    def get_starttag_text(self):
        """Return the text recorded for the most recently parsed start tag."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index *i*.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i + 1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i + 2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i + 1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i + 1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without "= value": the name is its own value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j + 1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert one entity or character reference.

        *match* comes from ``entity_or_charref``.  References that
        cannot be converted are returned as they were written.
        """
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without the closing ';' is left untouched.
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal -- parse the end tag at *i*.

        Returns the index just past the tag, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i + 2:j].strip().lower()
        if rawdata[j] == '>':
            j = j + 1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled
        by ``do_<tag>``), 1 for a balanced tag (handled by
        ``start_<tag>`` and pushed on the stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty *tag* closes the innermost open tag.  A named tag
        closes every tag opened since its last occurrence on the stack.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Only the existence of an end_<tag> handler matters here.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            found = len(self.stack)
            # No break: the loop ends with the index of the last
            # (innermost) occurrence of the tag.
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling *method* with *attrs*."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling *method*."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag (only when verbose)."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack:', self.stack)

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not a decimal number in 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return None
        if not 0 <= n <= 127:
            return None
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names missing from the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
480
481
482
class TestSGMLParser(SGMLParser):
    """Demonstration parser that prints the events it receives.

    Character data is buffered in ``self.testdata`` and printed by
    flush(), which runs once the buffer's repr reaches 70 characters
    and before any other event is reported.
    """

    def __init__(self, verbose=0):
        # The buffer must exist before the base __init__ calls reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once the buffered repr gets long."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data:', repr(data))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment:', r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            print('start tag: <' + tag, end=' ')
            for name, value in attrs:
                print(name + '=' + '"' + value + '"', end=' ')
            print('>')

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
535
536
537
def test(args=None):
    """Parse a file one character at a time, for manual testing.

    *args* defaults to ``sys.argv[1:]`` and has the form ``[-s] [file]``:
    ``-s`` selects the silent SGMLParser instead of TestSGMLParser,
    *file* defaults to 'test.html', and '-' reads standard input.
    Exits with status 1 when the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print(filename, ":", msg)
            sys.exit(1)
        # Close the file even if reading fails.
        with f:
            data = f.read()

    x = klass()
    # Feeding single characters exercises the incomplete-input paths.
    for c in data:
        x.feed(c)
    x.close()
571
572
573
# Run the command-line test driver when executed as a script.
if __name__ == '__main__':
    test()
575
576