Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Lib/_markupbase.py
12 views
1
"""Shared support for scanning document type declarations in HTML and XHTML.
2
3
This module is used as a foundation for the html.parser module. It has no
4
documented public API and should not be used directly.
5
6
"""
7
8
import re
9
10
# Bound ``match`` of a pattern for one declaration name token (a letter
# followed by letters, digits, '-', '_' or '.') plus any whitespace after it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match

# Bound ``match`` of a pattern for one single- or double-quoted string
# literal plus any whitespace after it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# Comment terminator: '--', optional whitespace, then '>'.
_commentclose = re.compile(r'--\s*>')

# Standard marked-section terminator: ']' ']' '>' with optional whitespace
# between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS-Word style marked-section terminator: a single ']' then '>'.
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled helpers above are kept; drop the module reference.
del re
21
22
23
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of those is
    defined in this class, so a subclass has to provide them.

    The parse_* and _parse_doctype_* methods return an index into the buffer
    on success and a negative value when the buffer ends before the construct
    being scanned is complete.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def reset(self):
        """Reset the tracked position to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    def updatepos(self, i, j):
        """Advance the tracked position over ``rawdata[i:j]`` and return *j*.

        Internal -- update line number and offset.  This should be called
        for each piece of data exactly once, in order -- in other words the
        concatenation of all the input strings to this function should be
        exactly the entire input.
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno += nlines
            # nlines is non-zero, so a newline exists and rindex cannot fail.
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset += j - i
        return j

    # Extra characters that parse_declaration() steps over one at a time
    # inside a declaration.  Empty here; parse_declaration() also resets it
    # to '' on the instance whenever it starts a DOCTYPE declaration.
    _decl_otherchars = ''

    def parse_declaration(self, i):
        """Parse the ``<!...>`` declaration starting at index *i*.

        Internal -- parse declaration (for use by subclasses).

        Returns the index just past the declaration, or a negative value if
        the buffer ends before the declaration is complete.  Comments and
        marked sections are delegated to parse_comment() and
        parse_marked_section().  A DOCTYPE declaration is reported through
        handle_decl(); any other named declaration goes to unknown_decl().

        Raises AssertionError if *i* is not at ``<!`` or if the declaration
        contains a character this scanner does not accept.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        if rawdata[i:j] != "<!":
            raise AssertionError("unexpected call to parse_declaration")
        opener = rawdata[j:j+2]
        if opener[:1] == ">":
            # the empty comment <!>
            return j + 1
        if opener == "--":
            # Comment: locate --.*-- as the body of the comment.  This has
            # to be tested before the lone-'-' case below, otherwise every
            # comment would be reported as incomplete.
            return self.parse_comment(i)
        if opener[:1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == "[":
            # Marked section: locate [statusWord [...arbitrary SGML...]] as
            # the body of the marked section, where statusWord is one of
            # TEMP, CDATA, IGNORE, INCLUDE, RCDATA.
            # Note that this is extended by Microsoft Office "Save as Web"
            # function to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j += 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    raise AssertionError(
                        "unsupported '[' char in %s declaration" % decltype)
                else:
                    raise AssertionError("unexpected '[' char in declaration")
            else:
                raise AssertionError("unexpected %r char in declaration" % c)
            if j < 0:
                return j
        return -1  # incomplete

    def parse_marked_section(self, i, report=1):
        """Parse the marked section ``<![...`` starting at index *i*.

        Internal -- parse a marked section.
        Override this to handle MS-word extension syntax
        <![if word]>content<![endif]>

        Returns the index just past the section terminator, or a negative
        value if the status keyword or the terminator is not in the buffer
        yet.  When *report* is true, the text between ``<![`` and the
        terminator is passed to unknown_decl().

        Raises AssertionError if *i* is not at ``<![`` or the status keyword
        is not recognised.
        """
        rawdata = self.rawdata
        if rawdata[i:i+3] != '<![':
            raise AssertionError("unexpected call to parse_marked_section()")
        sect_name, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sect_name in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sect_name in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            raise AssertionError(
                'unknown status keyword %r in marked section' % rawdata[i+3:j]
            )
        if not match:
            return -1
        if report:
            self.unknown_decl(rawdata[i+3:match.start(0)])
        return match.end(0)

    def parse_comment(self, i, report=1):
        """Parse the comment ``<!--...`` starting at index *i*.

        Internal -- parse comment.  Returns the index just past the comment
        terminator, or -1 if the comment is not terminated in the buffer.
        When *report* is true, the comment body is passed to
        handle_comment().

        Raises AssertionError if *i* is not at ``<!--``.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise AssertionError('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            self.handle_comment(rawdata[i+4:match.start(0)])
        return match.end(0)

    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset of a DOCTYPE declaration from index *i*.

        Internal -- scan past the internal subset in a <!DOCTYPE declaration,
        returning the index just past any whitespace following the trailing
        ']' (that is, the index of the closing '>').  Returns a negative
        value if the buffer ends first.  *declstartpos* is the start of the
        enclosing declaration; it is used for position updates before an
        AssertionError is raised on unexpected input.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    raise AssertionError(
                        "unexpected char in internal subset (in %r)" % s
                    )
                if j + 4 > n:
                    # Fewer than four characters left (this also covers a
                    # buffer ending right after "<!"); incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    # Comments inside the subset are skipped, not reported.
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j < 0:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    raise AssertionError(
                        "unknown declaration %r in internal subset" % name
                    )
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if j + 1 == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j += 1
            elif c == "]":
                j += 1
                while j < n and rawdata[j].isspace():
                    j += 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                raise AssertionError("unexpected char after internal subset")
            elif c.isspace():
                j += 1
            else:
                self.updatepos(declstartpos, j)
                raise AssertionError(
                    "unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    def _parse_doctype_element(self, i, declstartpos):
        """Internal -- scan past an <!ELEMENT declaration.

        *i* is the index just after the ELEMENT keyword.  Returns the index
        just past the closing '>', or -1 if the buffer ends first.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    def _parse_doctype_attlist(self, i, declstartpos):
        """Internal -- scan past an <!ATTLIST declaration.

        *i* is the index just after the ATTLIST keyword.  Returns the index
        just past the closing '>', or a negative value if the buffer ends
        first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j += 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # '#' is the last character in the buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    def _parse_doctype_notation(self, i, declstartpos):
        """Internal -- scan past an <!NOTATION declaration.

        *i* is the index just after the NOTATION keyword.  Returns the index
        just past the closing '>', or a negative value if the buffer ends
        first.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    def _parse_doctype_entity(self, i, declstartpos):
        """Internal -- scan past an <!ENTITY declaration.

        *i* is the index just after the ENTITY keyword.  A leading '%' and
        the whitespace after it are skipped before the entity name.  Returns
        the index just past the closing '>', or a negative value if the
        buffer ends first.
        """
        rawdata = self.rawdata
        j = i
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if not c.isspace():
                    break
                j += 1
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    def _scan_name(self, i, declstartpos):
        """Internal -- scan a name token starting at index *i*.

        Returns ``(name, pos)`` where *name* is the lower-cased token and
        *pos* is the index just past the token and any whitespace after it.
        Returns ``(None, -1)`` if *i* is at the end of the buffer or the
        token runs up to the end of the buffer, since more input could still
        extend it.

        Raises AssertionError, after updating the tracked position, if no
        name token starts at *i*.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if not m:
            self.updatepos(declstartpos, i)
            raise AssertionError(
                "expected name token at %r"
                % rawdata[declstartpos:declstartpos+20]
            )
        s = m.group()
        if i + len(s) == n:
            return None, -1  # end of buffer
        return s.strip().lower(), m.end()

    def unknown_decl(self, data):
        """To be overridden -- handler for unknown objects.

        Called with the declaration text; does nothing by default.
        """
        pass
397
398