Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmexpat/lib/xmltok_impl.c
3153 views
1
/* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2
__ __ _
3
___\ \/ /_ __ __ _| |_
4
/ _ \\ /| '_ \ / _` | __|
5
| __// \| |_) | (_| | |_
6
\___/_/\_\ .__/ \__,_|\__|
7
|_| XML parser
8
9
Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
Copyright (c) 2000 Clark Cooper <[email protected]>
11
Copyright (c) 2002 Fred L. Drake, Jr. <[email protected]>
12
Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
13
Copyright (c) 2016-2022 Sebastian Pipping <[email protected]>
14
Copyright (c) 2017 Rhodri James <[email protected]>
15
Copyright (c) 2018 Benjamin Peterson <[email protected]>
16
Copyright (c) 2018 Anton Maklakov <[email protected]>
17
Copyright (c) 2019 David Loffredo <[email protected]>
18
Copyright (c) 2020 Boris Kolpackov <[email protected]>
19
Copyright (c) 2022 Martin Ettl <[email protected]>
20
Licensed under the MIT license:
21
22
Permission is hereby granted, free of charge, to any person obtaining
23
a copy of this software and associated documentation files (the
24
"Software"), to deal in the Software without restriction, including
25
without limitation the rights to use, copy, modify, merge, publish,
26
distribute, sublicense, and/or sell copies of the Software, and to permit
27
persons to whom the Software is furnished to do so, subject to the
28
following conditions:
29
30
The above copyright notice and this permission notice shall be included
31
in all copies or substantial portions of the Software.
32
33
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
38
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39
USE OR OTHER DEALINGS IN THE SOFTWARE.
40
*/
41
42
#ifdef XML_TOK_IMPL_C
43
44
# ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#   define IS_INVALID_CHAR(enc, ptr, n) (0)
# endif

/* Expands to the `case BT_LEAD<n>:` arm of a switch over a byte type.
   The expansion refers to `enc` and `end`, which must be in scope at the
   point of use.  It returns XML_TOK_PARTIAL_CHAR when fewer than n bytes
   remain, stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
   IS_INVALID_CHAR reports the sequence as invalid, and otherwise advances
   ptr by n and breaks out of the switch.  Arguments are parenthesized so
   that any expression can be passed safely. */
# define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the three multi-byte lead cases above, plus arms that
   reject BT_NONXML, BT_MALFORM and BT_TRAIL by storing ptr in *nextTokPtr
   and returning XML_TOK_INVALID. */
# define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
68
69
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the inside of
   a name.  It returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.
   It stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the
   sequence is invalid or is not a name character.  Otherwise it advances
   ptr by n and breaks out of the switch.  Arguments are parenthesized so
   that any expression can be passed safely. */
# define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Expands to every switch arm that accepts one name character.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC and then shares
   the single-unit advance with BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and
   BT_MINUS.  Multi-byte lead bytes are handled by CHECK_NAME_CASE. */
# define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
97
98
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.  It returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.
   It stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the
   sequence is invalid or is not a name-start character.  Otherwise it
   advances ptr by n and breaks out of the switch.  Arguments are
   parenthesized so that any expression can be passed safely. */
# define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Expands to every switch arm that accepts one name-start character.
   BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC and then shares
   the single-unit advance with BT_NMSTRT and BT_HEX.  Multi-byte lead
   bytes are handled by CHECK_NMSTRT_CASE. */
# define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
123
124
/* When the including file has not supplied a name-decorating PREFIX macro,
   function names in this file are used undecorated. */
# if ! defined(PREFIX)
#   define PREFIX(ident) ident
# endif
127
128
/* True when at least `count` characters of MINBPC(enc) bytes each lie
   between ptr and end. */
# define HAS_CHARS(enc, ptr, end, count) \
  ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character lies between ptr and end. */
# define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available.  Wrapped in do/while(0) so that the macro plus
   its trailing semicolon forms exactly one statement and is safe inside an
   unbraced if/else. */
# define REQUIRE_CHARS(enc, ptr, end, count) \
  do { \
    if (! HAS_CHARS(enc, ptr, end, count)) { \
      return XML_TOK_PARTIAL; \
    } \
  } while (0)

/* Single-character form of REQUIRE_CHARS. */
# define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
141
142
/* ptr points to character following "<!-" */
143
144
static int PTRCALL
145
PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
146
const char **nextTokPtr) {
147
if (HAS_CHAR(enc, ptr, end)) {
148
if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
149
*nextTokPtr = ptr;
150
return XML_TOK_INVALID;
151
}
152
ptr += MINBPC(enc);
153
while (HAS_CHAR(enc, ptr, end)) {
154
switch (BYTE_TYPE(enc, ptr)) {
155
INVALID_CASES(ptr, nextTokPtr)
156
case BT_MINUS:
157
ptr += MINBPC(enc);
158
REQUIRE_CHAR(enc, ptr, end);
159
if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
160
ptr += MINBPC(enc);
161
REQUIRE_CHAR(enc, ptr, end);
162
if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
163
*nextTokPtr = ptr;
164
return XML_TOK_INVALID;
165
}
166
*nextTokPtr = ptr + MINBPC(enc);
167
return XML_TOK_COMMENT;
168
}
169
break;
170
default:
171
ptr += MINBPC(enc);
172
break;
173
}
174
}
175
}
176
return XML_TOK_PARTIAL;
177
}
178
179
/* ptr points to character following "<!" */
180
181
static int PTRCALL
182
PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
183
const char **nextTokPtr) {
184
REQUIRE_CHAR(enc, ptr, end);
185
switch (BYTE_TYPE(enc, ptr)) {
186
case BT_MINUS:
187
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
188
case BT_LSQB:
189
*nextTokPtr = ptr + MINBPC(enc);
190
return XML_TOK_COND_SECT_OPEN;
191
case BT_NMSTRT:
192
case BT_HEX:
193
ptr += MINBPC(enc);
194
break;
195
default:
196
*nextTokPtr = ptr;
197
return XML_TOK_INVALID;
198
}
199
while (HAS_CHAR(enc, ptr, end)) {
200
switch (BYTE_TYPE(enc, ptr)) {
201
case BT_PERCNT:
202
REQUIRE_CHARS(enc, ptr, end, 2);
203
/* don't allow <!ENTITY% foo "whatever"> */
204
switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
205
case BT_S:
206
case BT_CR:
207
case BT_LF:
208
case BT_PERCNT:
209
*nextTokPtr = ptr;
210
return XML_TOK_INVALID;
211
}
212
/* fall through */
213
case BT_S:
214
case BT_CR:
215
case BT_LF:
216
*nextTokPtr = ptr;
217
return XML_TOK_DECL_OPEN;
218
case BT_NMSTRT:
219
case BT_HEX:
220
ptr += MINBPC(enc);
221
break;
222
default:
223
*nextTokPtr = ptr;
224
return XML_TOK_INVALID;
225
}
226
}
227
return XML_TOK_PARTIAL;
228
}
229
230
static int PTRCALL
231
PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
232
int *tokPtr) {
233
int upper = 0;
234
UNUSED_P(enc);
235
*tokPtr = XML_TOK_PI;
236
if (end - ptr != MINBPC(enc) * 3)
237
return 1;
238
switch (BYTE_TO_ASCII(enc, ptr)) {
239
case ASCII_x:
240
break;
241
case ASCII_X:
242
upper = 1;
243
break;
244
default:
245
return 1;
246
}
247
ptr += MINBPC(enc);
248
switch (BYTE_TO_ASCII(enc, ptr)) {
249
case ASCII_m:
250
break;
251
case ASCII_M:
252
upper = 1;
253
break;
254
default:
255
return 1;
256
}
257
ptr += MINBPC(enc);
258
switch (BYTE_TO_ASCII(enc, ptr)) {
259
case ASCII_l:
260
break;
261
case ASCII_L:
262
upper = 1;
263
break;
264
default:
265
return 1;
266
}
267
if (upper)
268
return 0;
269
*tokPtr = XML_TOK_XML_DECL;
270
return 1;
271
}
272
273
/* ptr points to character following "<?" */
274
275
static int PTRCALL
276
PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
277
const char **nextTokPtr) {
278
int tok;
279
const char *target = ptr;
280
REQUIRE_CHAR(enc, ptr, end);
281
switch (BYTE_TYPE(enc, ptr)) {
282
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
283
default:
284
*nextTokPtr = ptr;
285
return XML_TOK_INVALID;
286
}
287
while (HAS_CHAR(enc, ptr, end)) {
288
switch (BYTE_TYPE(enc, ptr)) {
289
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
290
case BT_S:
291
case BT_CR:
292
case BT_LF:
293
if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
294
*nextTokPtr = ptr;
295
return XML_TOK_INVALID;
296
}
297
ptr += MINBPC(enc);
298
while (HAS_CHAR(enc, ptr, end)) {
299
switch (BYTE_TYPE(enc, ptr)) {
300
INVALID_CASES(ptr, nextTokPtr)
301
case BT_QUEST:
302
ptr += MINBPC(enc);
303
REQUIRE_CHAR(enc, ptr, end);
304
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
305
*nextTokPtr = ptr + MINBPC(enc);
306
return tok;
307
}
308
break;
309
default:
310
ptr += MINBPC(enc);
311
break;
312
}
313
}
314
return XML_TOK_PARTIAL;
315
case BT_QUEST:
316
if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
317
*nextTokPtr = ptr;
318
return XML_TOK_INVALID;
319
}
320
ptr += MINBPC(enc);
321
REQUIRE_CHAR(enc, ptr, end);
322
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
323
*nextTokPtr = ptr + MINBPC(enc);
324
return tok;
325
}
326
/* fall through */
327
default:
328
*nextTokPtr = ptr;
329
return XML_TOK_INVALID;
330
}
331
}
332
return XML_TOK_PARTIAL;
333
}
334
335
static int PTRCALL
336
PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
337
const char **nextTokPtr) {
338
static const char CDATA_LSQB[]
339
= {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
340
int i;
341
UNUSED_P(enc);
342
/* CDATA[ */
343
REQUIRE_CHARS(enc, ptr, end, 6);
344
for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
345
if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
346
*nextTokPtr = ptr;
347
return XML_TOK_INVALID;
348
}
349
}
350
*nextTokPtr = ptr;
351
return XML_TOK_CDATA_SECT_OPEN;
352
}
353
354
static int PTRCALL
355
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
356
const char **nextTokPtr) {
357
if (ptr >= end)
358
return XML_TOK_NONE;
359
if (MINBPC(enc) > 1) {
360
size_t n = end - ptr;
361
if (n & (MINBPC(enc) - 1)) {
362
n &= ~(MINBPC(enc) - 1);
363
if (n == 0)
364
return XML_TOK_PARTIAL;
365
end = ptr + n;
366
}
367
}
368
switch (BYTE_TYPE(enc, ptr)) {
369
case BT_RSQB:
370
ptr += MINBPC(enc);
371
REQUIRE_CHAR(enc, ptr, end);
372
if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
373
break;
374
ptr += MINBPC(enc);
375
REQUIRE_CHAR(enc, ptr, end);
376
if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
377
ptr -= MINBPC(enc);
378
break;
379
}
380
*nextTokPtr = ptr + MINBPC(enc);
381
return XML_TOK_CDATA_SECT_CLOSE;
382
case BT_CR:
383
ptr += MINBPC(enc);
384
REQUIRE_CHAR(enc, ptr, end);
385
if (BYTE_TYPE(enc, ptr) == BT_LF)
386
ptr += MINBPC(enc);
387
*nextTokPtr = ptr;
388
return XML_TOK_DATA_NEWLINE;
389
case BT_LF:
390
*nextTokPtr = ptr + MINBPC(enc);
391
return XML_TOK_DATA_NEWLINE;
392
INVALID_CASES(ptr, nextTokPtr)
393
default:
394
ptr += MINBPC(enc);
395
break;
396
}
397
while (HAS_CHAR(enc, ptr, end)) {
398
switch (BYTE_TYPE(enc, ptr)) {
399
# define LEAD_CASE(n) \
400
case BT_LEAD##n: \
401
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
402
*nextTokPtr = ptr; \
403
return XML_TOK_DATA_CHARS; \
404
} \
405
ptr += n; \
406
break;
407
LEAD_CASE(2)
408
LEAD_CASE(3)
409
LEAD_CASE(4)
410
# undef LEAD_CASE
411
case BT_NONXML:
412
case BT_MALFORM:
413
case BT_TRAIL:
414
case BT_CR:
415
case BT_LF:
416
case BT_RSQB:
417
*nextTokPtr = ptr;
418
return XML_TOK_DATA_CHARS;
419
default:
420
ptr += MINBPC(enc);
421
break;
422
}
423
}
424
*nextTokPtr = ptr;
425
return XML_TOK_DATA_CHARS;
426
}
427
428
/* ptr points to character following "</" */
429
430
static int PTRCALL
431
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
432
const char **nextTokPtr) {
433
REQUIRE_CHAR(enc, ptr, end);
434
switch (BYTE_TYPE(enc, ptr)) {
435
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
436
default:
437
*nextTokPtr = ptr;
438
return XML_TOK_INVALID;
439
}
440
while (HAS_CHAR(enc, ptr, end)) {
441
switch (BYTE_TYPE(enc, ptr)) {
442
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
443
case BT_S:
444
case BT_CR:
445
case BT_LF:
446
for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
447
switch (BYTE_TYPE(enc, ptr)) {
448
case BT_S:
449
case BT_CR:
450
case BT_LF:
451
break;
452
case BT_GT:
453
*nextTokPtr = ptr + MINBPC(enc);
454
return XML_TOK_END_TAG;
455
default:
456
*nextTokPtr = ptr;
457
return XML_TOK_INVALID;
458
}
459
}
460
return XML_TOK_PARTIAL;
461
# ifdef XML_NS
462
case BT_COLON:
463
/* no need to check qname syntax here,
464
since end-tag must match exactly */
465
ptr += MINBPC(enc);
466
break;
467
# endif
468
case BT_GT:
469
*nextTokPtr = ptr + MINBPC(enc);
470
return XML_TOK_END_TAG;
471
default:
472
*nextTokPtr = ptr;
473
return XML_TOK_INVALID;
474
}
475
}
476
return XML_TOK_PARTIAL;
477
}
478
479
/* ptr points to character following "&#X" */
480
481
static int PTRCALL
482
PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
483
const char **nextTokPtr) {
484
if (HAS_CHAR(enc, ptr, end)) {
485
switch (BYTE_TYPE(enc, ptr)) {
486
case BT_DIGIT:
487
case BT_HEX:
488
break;
489
default:
490
*nextTokPtr = ptr;
491
return XML_TOK_INVALID;
492
}
493
for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
494
switch (BYTE_TYPE(enc, ptr)) {
495
case BT_DIGIT:
496
case BT_HEX:
497
break;
498
case BT_SEMI:
499
*nextTokPtr = ptr + MINBPC(enc);
500
return XML_TOK_CHAR_REF;
501
default:
502
*nextTokPtr = ptr;
503
return XML_TOK_INVALID;
504
}
505
}
506
}
507
return XML_TOK_PARTIAL;
508
}
509
510
/* ptr points to character following "&#" */
511
512
static int PTRCALL
513
PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
514
const char **nextTokPtr) {
515
if (HAS_CHAR(enc, ptr, end)) {
516
if (CHAR_MATCHES(enc, ptr, ASCII_x))
517
return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
518
switch (BYTE_TYPE(enc, ptr)) {
519
case BT_DIGIT:
520
break;
521
default:
522
*nextTokPtr = ptr;
523
return XML_TOK_INVALID;
524
}
525
for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
526
switch (BYTE_TYPE(enc, ptr)) {
527
case BT_DIGIT:
528
break;
529
case BT_SEMI:
530
*nextTokPtr = ptr + MINBPC(enc);
531
return XML_TOK_CHAR_REF;
532
default:
533
*nextTokPtr = ptr;
534
return XML_TOK_INVALID;
535
}
536
}
537
}
538
return XML_TOK_PARTIAL;
539
}
540
541
/* ptr points to character following "&" */
542
543
static int PTRCALL
544
PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
545
const char **nextTokPtr) {
546
REQUIRE_CHAR(enc, ptr, end);
547
switch (BYTE_TYPE(enc, ptr)) {
548
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
549
case BT_NUM:
550
return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
551
default:
552
*nextTokPtr = ptr;
553
return XML_TOK_INVALID;
554
}
555
while (HAS_CHAR(enc, ptr, end)) {
556
switch (BYTE_TYPE(enc, ptr)) {
557
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
558
case BT_SEMI:
559
*nextTokPtr = ptr + MINBPC(enc);
560
return XML_TOK_ENTITY_REF;
561
default:
562
*nextTokPtr = ptr;
563
return XML_TOK_INVALID;
564
}
565
}
566
return XML_TOK_PARTIAL;
567
}
568
569
/* ptr points to character following first character of attribute name */
570
571
static int PTRCALL
572
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
573
const char **nextTokPtr) {
574
# ifdef XML_NS
575
int hadColon = 0;
576
# endif
577
while (HAS_CHAR(enc, ptr, end)) {
578
switch (BYTE_TYPE(enc, ptr)) {
579
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
580
# ifdef XML_NS
581
case BT_COLON:
582
if (hadColon) {
583
*nextTokPtr = ptr;
584
return XML_TOK_INVALID;
585
}
586
hadColon = 1;
587
ptr += MINBPC(enc);
588
REQUIRE_CHAR(enc, ptr, end);
589
switch (BYTE_TYPE(enc, ptr)) {
590
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
591
default:
592
*nextTokPtr = ptr;
593
return XML_TOK_INVALID;
594
}
595
break;
596
# endif
597
case BT_S:
598
case BT_CR:
599
case BT_LF:
600
for (;;) {
601
int t;
602
603
ptr += MINBPC(enc);
604
REQUIRE_CHAR(enc, ptr, end);
605
t = BYTE_TYPE(enc, ptr);
606
if (t == BT_EQUALS)
607
break;
608
switch (t) {
609
case BT_S:
610
case BT_LF:
611
case BT_CR:
612
break;
613
default:
614
*nextTokPtr = ptr;
615
return XML_TOK_INVALID;
616
}
617
}
618
/* fall through */
619
case BT_EQUALS: {
620
int open;
621
# ifdef XML_NS
622
hadColon = 0;
623
# endif
624
for (;;) {
625
ptr += MINBPC(enc);
626
REQUIRE_CHAR(enc, ptr, end);
627
open = BYTE_TYPE(enc, ptr);
628
if (open == BT_QUOT || open == BT_APOS)
629
break;
630
switch (open) {
631
case BT_S:
632
case BT_LF:
633
case BT_CR:
634
break;
635
default:
636
*nextTokPtr = ptr;
637
return XML_TOK_INVALID;
638
}
639
}
640
ptr += MINBPC(enc);
641
/* in attribute value */
642
for (;;) {
643
int t;
644
REQUIRE_CHAR(enc, ptr, end);
645
t = BYTE_TYPE(enc, ptr);
646
if (t == open)
647
break;
648
switch (t) {
649
INVALID_CASES(ptr, nextTokPtr)
650
case BT_AMP: {
651
int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
652
if (tok <= 0) {
653
if (tok == XML_TOK_INVALID)
654
*nextTokPtr = ptr;
655
return tok;
656
}
657
break;
658
}
659
case BT_LT:
660
*nextTokPtr = ptr;
661
return XML_TOK_INVALID;
662
default:
663
ptr += MINBPC(enc);
664
break;
665
}
666
}
667
ptr += MINBPC(enc);
668
REQUIRE_CHAR(enc, ptr, end);
669
switch (BYTE_TYPE(enc, ptr)) {
670
case BT_S:
671
case BT_CR:
672
case BT_LF:
673
break;
674
case BT_SOL:
675
goto sol;
676
case BT_GT:
677
goto gt;
678
default:
679
*nextTokPtr = ptr;
680
return XML_TOK_INVALID;
681
}
682
/* ptr points to closing quote */
683
for (;;) {
684
ptr += MINBPC(enc);
685
REQUIRE_CHAR(enc, ptr, end);
686
switch (BYTE_TYPE(enc, ptr)) {
687
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
688
case BT_S:
689
case BT_CR:
690
case BT_LF:
691
continue;
692
case BT_GT:
693
gt:
694
*nextTokPtr = ptr + MINBPC(enc);
695
return XML_TOK_START_TAG_WITH_ATTS;
696
case BT_SOL:
697
sol:
698
ptr += MINBPC(enc);
699
REQUIRE_CHAR(enc, ptr, end);
700
if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
701
*nextTokPtr = ptr;
702
return XML_TOK_INVALID;
703
}
704
*nextTokPtr = ptr + MINBPC(enc);
705
return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
706
default:
707
*nextTokPtr = ptr;
708
return XML_TOK_INVALID;
709
}
710
break;
711
}
712
break;
713
}
714
default:
715
*nextTokPtr = ptr;
716
return XML_TOK_INVALID;
717
}
718
}
719
return XML_TOK_PARTIAL;
720
}
721
722
/* ptr points to character following "<" */
723
724
static int PTRCALL
725
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
726
const char **nextTokPtr) {
727
# ifdef XML_NS
728
int hadColon;
729
# endif
730
REQUIRE_CHAR(enc, ptr, end);
731
switch (BYTE_TYPE(enc, ptr)) {
732
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
733
case BT_EXCL:
734
ptr += MINBPC(enc);
735
REQUIRE_CHAR(enc, ptr, end);
736
switch (BYTE_TYPE(enc, ptr)) {
737
case BT_MINUS:
738
return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739
case BT_LSQB:
740
return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
741
}
742
*nextTokPtr = ptr;
743
return XML_TOK_INVALID;
744
case BT_QUEST:
745
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
746
case BT_SOL:
747
return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
748
default:
749
*nextTokPtr = ptr;
750
return XML_TOK_INVALID;
751
}
752
# ifdef XML_NS
753
hadColon = 0;
754
# endif
755
/* we have a start-tag */
756
while (HAS_CHAR(enc, ptr, end)) {
757
switch (BYTE_TYPE(enc, ptr)) {
758
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
759
# ifdef XML_NS
760
case BT_COLON:
761
if (hadColon) {
762
*nextTokPtr = ptr;
763
return XML_TOK_INVALID;
764
}
765
hadColon = 1;
766
ptr += MINBPC(enc);
767
REQUIRE_CHAR(enc, ptr, end);
768
switch (BYTE_TYPE(enc, ptr)) {
769
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
770
default:
771
*nextTokPtr = ptr;
772
return XML_TOK_INVALID;
773
}
774
break;
775
# endif
776
case BT_S:
777
case BT_CR:
778
case BT_LF: {
779
ptr += MINBPC(enc);
780
while (HAS_CHAR(enc, ptr, end)) {
781
switch (BYTE_TYPE(enc, ptr)) {
782
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
783
case BT_GT:
784
goto gt;
785
case BT_SOL:
786
goto sol;
787
case BT_S:
788
case BT_CR:
789
case BT_LF:
790
ptr += MINBPC(enc);
791
continue;
792
default:
793
*nextTokPtr = ptr;
794
return XML_TOK_INVALID;
795
}
796
return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
797
}
798
return XML_TOK_PARTIAL;
799
}
800
case BT_GT:
801
gt:
802
*nextTokPtr = ptr + MINBPC(enc);
803
return XML_TOK_START_TAG_NO_ATTS;
804
case BT_SOL:
805
sol:
806
ptr += MINBPC(enc);
807
REQUIRE_CHAR(enc, ptr, end);
808
if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
809
*nextTokPtr = ptr;
810
return XML_TOK_INVALID;
811
}
812
*nextTokPtr = ptr + MINBPC(enc);
813
return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
814
default:
815
*nextTokPtr = ptr;
816
return XML_TOK_INVALID;
817
}
818
}
819
return XML_TOK_PARTIAL;
820
}
821
822
static int PTRCALL
823
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
824
const char **nextTokPtr) {
825
if (ptr >= end)
826
return XML_TOK_NONE;
827
if (MINBPC(enc) > 1) {
828
size_t n = end - ptr;
829
if (n & (MINBPC(enc) - 1)) {
830
n &= ~(MINBPC(enc) - 1);
831
if (n == 0)
832
return XML_TOK_PARTIAL;
833
end = ptr + n;
834
}
835
}
836
switch (BYTE_TYPE(enc, ptr)) {
837
case BT_LT:
838
return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
839
case BT_AMP:
840
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
841
case BT_CR:
842
ptr += MINBPC(enc);
843
if (! HAS_CHAR(enc, ptr, end))
844
return XML_TOK_TRAILING_CR;
845
if (BYTE_TYPE(enc, ptr) == BT_LF)
846
ptr += MINBPC(enc);
847
*nextTokPtr = ptr;
848
return XML_TOK_DATA_NEWLINE;
849
case BT_LF:
850
*nextTokPtr = ptr + MINBPC(enc);
851
return XML_TOK_DATA_NEWLINE;
852
case BT_RSQB:
853
ptr += MINBPC(enc);
854
if (! HAS_CHAR(enc, ptr, end))
855
return XML_TOK_TRAILING_RSQB;
856
if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
857
break;
858
ptr += MINBPC(enc);
859
if (! HAS_CHAR(enc, ptr, end))
860
return XML_TOK_TRAILING_RSQB;
861
if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
862
ptr -= MINBPC(enc);
863
break;
864
}
865
*nextTokPtr = ptr;
866
return XML_TOK_INVALID;
867
INVALID_CASES(ptr, nextTokPtr)
868
default:
869
ptr += MINBPC(enc);
870
break;
871
}
872
while (HAS_CHAR(enc, ptr, end)) {
873
switch (BYTE_TYPE(enc, ptr)) {
874
# define LEAD_CASE(n) \
875
case BT_LEAD##n: \
876
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
877
*nextTokPtr = ptr; \
878
return XML_TOK_DATA_CHARS; \
879
} \
880
ptr += n; \
881
break;
882
LEAD_CASE(2)
883
LEAD_CASE(3)
884
LEAD_CASE(4)
885
# undef LEAD_CASE
886
case BT_RSQB:
887
if (HAS_CHARS(enc, ptr, end, 2)) {
888
if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
889
ptr += MINBPC(enc);
890
break;
891
}
892
if (HAS_CHARS(enc, ptr, end, 3)) {
893
if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
894
ptr += MINBPC(enc);
895
break;
896
}
897
*nextTokPtr = ptr + 2 * MINBPC(enc);
898
return XML_TOK_INVALID;
899
}
900
}
901
/* fall through */
902
case BT_AMP:
903
case BT_LT:
904
case BT_NONXML:
905
case BT_MALFORM:
906
case BT_TRAIL:
907
case BT_CR:
908
case BT_LF:
909
*nextTokPtr = ptr;
910
return XML_TOK_DATA_CHARS;
911
default:
912
ptr += MINBPC(enc);
913
break;
914
}
915
}
916
*nextTokPtr = ptr;
917
return XML_TOK_DATA_CHARS;
918
}
919
920
/* ptr points to character following "%" */
921
922
static int PTRCALL
923
PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
924
const char **nextTokPtr) {
925
REQUIRE_CHAR(enc, ptr, end);
926
switch (BYTE_TYPE(enc, ptr)) {
927
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
928
case BT_S:
929
case BT_LF:
930
case BT_CR:
931
case BT_PERCNT:
932
*nextTokPtr = ptr;
933
return XML_TOK_PERCENT;
934
default:
935
*nextTokPtr = ptr;
936
return XML_TOK_INVALID;
937
}
938
while (HAS_CHAR(enc, ptr, end)) {
939
switch (BYTE_TYPE(enc, ptr)) {
940
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
941
case BT_SEMI:
942
*nextTokPtr = ptr + MINBPC(enc);
943
return XML_TOK_PARAM_ENTITY_REF;
944
default:
945
*nextTokPtr = ptr;
946
return XML_TOK_INVALID;
947
}
948
}
949
return XML_TOK_PARTIAL;
950
}
951
952
static int PTRCALL
953
PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
954
const char **nextTokPtr) {
955
REQUIRE_CHAR(enc, ptr, end);
956
switch (BYTE_TYPE(enc, ptr)) {
957
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
958
default:
959
*nextTokPtr = ptr;
960
return XML_TOK_INVALID;
961
}
962
while (HAS_CHAR(enc, ptr, end)) {
963
switch (BYTE_TYPE(enc, ptr)) {
964
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
965
case BT_CR:
966
case BT_LF:
967
case BT_S:
968
case BT_RPAR:
969
case BT_GT:
970
case BT_PERCNT:
971
case BT_VERBAR:
972
*nextTokPtr = ptr;
973
return XML_TOK_POUND_NAME;
974
default:
975
*nextTokPtr = ptr;
976
return XML_TOK_INVALID;
977
}
978
}
979
return -XML_TOK_POUND_NAME;
980
}
981
982
static int PTRCALL
983
PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
984
const char **nextTokPtr) {
985
while (HAS_CHAR(enc, ptr, end)) {
986
int t = BYTE_TYPE(enc, ptr);
987
switch (t) {
988
INVALID_CASES(ptr, nextTokPtr)
989
case BT_QUOT:
990
case BT_APOS:
991
ptr += MINBPC(enc);
992
if (t != open)
993
break;
994
if (! HAS_CHAR(enc, ptr, end))
995
return -XML_TOK_LITERAL;
996
*nextTokPtr = ptr;
997
switch (BYTE_TYPE(enc, ptr)) {
998
case BT_S:
999
case BT_CR:
1000
case BT_LF:
1001
case BT_GT:
1002
case BT_PERCNT:
1003
case BT_LSQB:
1004
return XML_TOK_LITERAL;
1005
default:
1006
return XML_TOK_INVALID;
1007
}
1008
default:
1009
ptr += MINBPC(enc);
1010
break;
1011
}
1012
}
1013
return XML_TOK_PARTIAL;
1014
}
1015
1016
static int PTRCALL
1017
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1018
const char **nextTokPtr) {
1019
int tok;
1020
if (ptr >= end)
1021
return XML_TOK_NONE;
1022
if (MINBPC(enc) > 1) {
1023
size_t n = end - ptr;
1024
if (n & (MINBPC(enc) - 1)) {
1025
n &= ~(MINBPC(enc) - 1);
1026
if (n == 0)
1027
return XML_TOK_PARTIAL;
1028
end = ptr + n;
1029
}
1030
}
1031
switch (BYTE_TYPE(enc, ptr)) {
1032
case BT_QUOT:
1033
return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1034
case BT_APOS:
1035
return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1036
case BT_LT: {
1037
ptr += MINBPC(enc);
1038
REQUIRE_CHAR(enc, ptr, end);
1039
switch (BYTE_TYPE(enc, ptr)) {
1040
case BT_EXCL:
1041
return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042
case BT_QUEST:
1043
return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1044
case BT_NMSTRT:
1045
case BT_HEX:
1046
case BT_NONASCII:
1047
case BT_LEAD2:
1048
case BT_LEAD3:
1049
case BT_LEAD4:
1050
*nextTokPtr = ptr - MINBPC(enc);
1051
return XML_TOK_INSTANCE_START;
1052
}
1053
*nextTokPtr = ptr;
1054
return XML_TOK_INVALID;
1055
}
1056
case BT_CR:
1057
if (ptr + MINBPC(enc) == end) {
1058
*nextTokPtr = end;
1059
/* indicate that this might be part of a CR/LF pair */
1060
return -XML_TOK_PROLOG_S;
1061
}
1062
/* fall through */
1063
case BT_S:
1064
case BT_LF:
1065
for (;;) {
1066
ptr += MINBPC(enc);
1067
if (! HAS_CHAR(enc, ptr, end))
1068
break;
1069
switch (BYTE_TYPE(enc, ptr)) {
1070
case BT_S:
1071
case BT_LF:
1072
break;
1073
case BT_CR:
1074
/* don't split CR/LF pair */
1075
if (ptr + MINBPC(enc) != end)
1076
break;
1077
/* fall through */
1078
default:
1079
*nextTokPtr = ptr;
1080
return XML_TOK_PROLOG_S;
1081
}
1082
}
1083
*nextTokPtr = ptr;
1084
return XML_TOK_PROLOG_S;
1085
case BT_PERCNT:
1086
return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1087
case BT_COMMA:
1088
*nextTokPtr = ptr + MINBPC(enc);
1089
return XML_TOK_COMMA;
1090
case BT_LSQB:
1091
*nextTokPtr = ptr + MINBPC(enc);
1092
return XML_TOK_OPEN_BRACKET;
1093
case BT_RSQB:
1094
ptr += MINBPC(enc);
1095
if (! HAS_CHAR(enc, ptr, end))
1096
return -XML_TOK_CLOSE_BRACKET;
1097
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1098
REQUIRE_CHARS(enc, ptr, end, 2);
1099
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1100
*nextTokPtr = ptr + 2 * MINBPC(enc);
1101
return XML_TOK_COND_SECT_CLOSE;
1102
}
1103
}
1104
*nextTokPtr = ptr;
1105
return XML_TOK_CLOSE_BRACKET;
1106
case BT_LPAR:
1107
*nextTokPtr = ptr + MINBPC(enc);
1108
return XML_TOK_OPEN_PAREN;
1109
case BT_RPAR:
1110
ptr += MINBPC(enc);
1111
if (! HAS_CHAR(enc, ptr, end))
1112
return -XML_TOK_CLOSE_PAREN;
1113
switch (BYTE_TYPE(enc, ptr)) {
1114
case BT_AST:
1115
*nextTokPtr = ptr + MINBPC(enc);
1116
return XML_TOK_CLOSE_PAREN_ASTERISK;
1117
case BT_QUEST:
1118
*nextTokPtr = ptr + MINBPC(enc);
1119
return XML_TOK_CLOSE_PAREN_QUESTION;
1120
case BT_PLUS:
1121
*nextTokPtr = ptr + MINBPC(enc);
1122
return XML_TOK_CLOSE_PAREN_PLUS;
1123
case BT_CR:
1124
case BT_LF:
1125
case BT_S:
1126
case BT_GT:
1127
case BT_COMMA:
1128
case BT_VERBAR:
1129
case BT_RPAR:
1130
*nextTokPtr = ptr;
1131
return XML_TOK_CLOSE_PAREN;
1132
}
1133
*nextTokPtr = ptr;
1134
return XML_TOK_INVALID;
1135
case BT_VERBAR:
1136
*nextTokPtr = ptr + MINBPC(enc);
1137
return XML_TOK_OR;
1138
case BT_GT:
1139
*nextTokPtr = ptr + MINBPC(enc);
1140
return XML_TOK_DECL_CLOSE;
1141
case BT_NUM:
1142
return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1143
# define LEAD_CASE(n) \
1144
case BT_LEAD##n: \
1145
if (end - ptr < n) \
1146
return XML_TOK_PARTIAL_CHAR; \
1147
if (IS_INVALID_CHAR(enc, ptr, n)) { \
1148
*nextTokPtr = ptr; \
1149
return XML_TOK_INVALID; \
1150
} \
1151
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1152
ptr += n; \
1153
tok = XML_TOK_NAME; \
1154
break; \
1155
} \
1156
if (IS_NAME_CHAR(enc, ptr, n)) { \
1157
ptr += n; \
1158
tok = XML_TOK_NMTOKEN; \
1159
break; \
1160
} \
1161
*nextTokPtr = ptr; \
1162
return XML_TOK_INVALID;
1163
LEAD_CASE(2)
1164
LEAD_CASE(3)
1165
LEAD_CASE(4)
1166
# undef LEAD_CASE
1167
case BT_NMSTRT:
1168
case BT_HEX:
1169
tok = XML_TOK_NAME;
1170
ptr += MINBPC(enc);
1171
break;
1172
case BT_DIGIT:
1173
case BT_NAME:
1174
case BT_MINUS:
1175
# ifdef XML_NS
1176
case BT_COLON:
1177
# endif
1178
tok = XML_TOK_NMTOKEN;
1179
ptr += MINBPC(enc);
1180
break;
1181
case BT_NONASCII:
1182
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1183
ptr += MINBPC(enc);
1184
tok = XML_TOK_NAME;
1185
break;
1186
}
1187
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1188
ptr += MINBPC(enc);
1189
tok = XML_TOK_NMTOKEN;
1190
break;
1191
}
1192
/* fall through */
1193
default:
1194
*nextTokPtr = ptr;
1195
return XML_TOK_INVALID;
1196
}
1197
while (HAS_CHAR(enc, ptr, end)) {
1198
switch (BYTE_TYPE(enc, ptr)) {
1199
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1200
case BT_GT:
1201
case BT_RPAR:
1202
case BT_COMMA:
1203
case BT_VERBAR:
1204
case BT_LSQB:
1205
case BT_PERCNT:
1206
case BT_S:
1207
case BT_CR:
1208
case BT_LF:
1209
*nextTokPtr = ptr;
1210
return tok;
1211
# ifdef XML_NS
1212
case BT_COLON:
1213
ptr += MINBPC(enc);
1214
switch (tok) {
1215
case XML_TOK_NAME:
1216
REQUIRE_CHAR(enc, ptr, end);
1217
tok = XML_TOK_PREFIXED_NAME;
1218
switch (BYTE_TYPE(enc, ptr)) {
1219
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1220
default:
1221
tok = XML_TOK_NMTOKEN;
1222
break;
1223
}
1224
break;
1225
case XML_TOK_PREFIXED_NAME:
1226
tok = XML_TOK_NMTOKEN;
1227
break;
1228
}
1229
break;
1230
# endif
1231
case BT_PLUS:
1232
if (tok == XML_TOK_NMTOKEN) {
1233
*nextTokPtr = ptr;
1234
return XML_TOK_INVALID;
1235
}
1236
*nextTokPtr = ptr + MINBPC(enc);
1237
return XML_TOK_NAME_PLUS;
1238
case BT_AST:
1239
if (tok == XML_TOK_NMTOKEN) {
1240
*nextTokPtr = ptr;
1241
return XML_TOK_INVALID;
1242
}
1243
*nextTokPtr = ptr + MINBPC(enc);
1244
return XML_TOK_NAME_ASTERISK;
1245
case BT_QUEST:
1246
if (tok == XML_TOK_NMTOKEN) {
1247
*nextTokPtr = ptr;
1248
return XML_TOK_INVALID;
1249
}
1250
*nextTokPtr = ptr + MINBPC(enc);
1251
return XML_TOK_NAME_QUESTION;
1252
default:
1253
*nextTokPtr = ptr;
1254
return XML_TOK_INVALID;
1255
}
1256
}
1257
return -tok;
1258
}
1259
1260
static int PTRCALL
1261
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1262
const char **nextTokPtr) {
1263
const char *start;
1264
if (ptr >= end)
1265
return XML_TOK_NONE;
1266
else if (! HAS_CHAR(enc, ptr, end)) {
1267
/* This line cannot be executed. The incoming data has already
1268
* been tokenized once, so incomplete characters like this have
1269
* already been eliminated from the input. Retaining the paranoia
1270
* check is still valuable, however.
1271
*/
1272
return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1273
}
1274
start = ptr;
1275
while (HAS_CHAR(enc, ptr, end)) {
1276
switch (BYTE_TYPE(enc, ptr)) {
1277
# define LEAD_CASE(n) \
1278
case BT_LEAD##n: \
1279
ptr += n; /* NOTE: The encoding has already been validated. */ \
1280
break;
1281
LEAD_CASE(2)
1282
LEAD_CASE(3)
1283
LEAD_CASE(4)
1284
# undef LEAD_CASE
1285
case BT_AMP:
1286
if (ptr == start)
1287
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1288
*nextTokPtr = ptr;
1289
return XML_TOK_DATA_CHARS;
1290
case BT_LT:
1291
/* this is for inside entity references */
1292
*nextTokPtr = ptr;
1293
return XML_TOK_INVALID;
1294
case BT_LF:
1295
if (ptr == start) {
1296
*nextTokPtr = ptr + MINBPC(enc);
1297
return XML_TOK_DATA_NEWLINE;
1298
}
1299
*nextTokPtr = ptr;
1300
return XML_TOK_DATA_CHARS;
1301
case BT_CR:
1302
if (ptr == start) {
1303
ptr += MINBPC(enc);
1304
if (! HAS_CHAR(enc, ptr, end))
1305
return XML_TOK_TRAILING_CR;
1306
if (BYTE_TYPE(enc, ptr) == BT_LF)
1307
ptr += MINBPC(enc);
1308
*nextTokPtr = ptr;
1309
return XML_TOK_DATA_NEWLINE;
1310
}
1311
*nextTokPtr = ptr;
1312
return XML_TOK_DATA_CHARS;
1313
case BT_S:
1314
if (ptr == start) {
1315
*nextTokPtr = ptr + MINBPC(enc);
1316
return XML_TOK_ATTRIBUTE_VALUE_S;
1317
}
1318
*nextTokPtr = ptr;
1319
return XML_TOK_DATA_CHARS;
1320
default:
1321
ptr += MINBPC(enc);
1322
break;
1323
}
1324
}
1325
*nextTokPtr = ptr;
1326
return XML_TOK_DATA_CHARS;
1327
}
1328
1329
static int PTRCALL
1330
PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1331
const char **nextTokPtr) {
1332
const char *start;
1333
if (ptr >= end)
1334
return XML_TOK_NONE;
1335
else if (! HAS_CHAR(enc, ptr, end)) {
1336
/* This line cannot be executed. The incoming data has already
1337
* been tokenized once, so incomplete characters like this have
1338
* already been eliminated from the input. Retaining the paranoia
1339
* check is still valuable, however.
1340
*/
1341
return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1342
}
1343
start = ptr;
1344
while (HAS_CHAR(enc, ptr, end)) {
1345
switch (BYTE_TYPE(enc, ptr)) {
1346
# define LEAD_CASE(n) \
1347
case BT_LEAD##n: \
1348
ptr += n; /* NOTE: The encoding has already been validated. */ \
1349
break;
1350
LEAD_CASE(2)
1351
LEAD_CASE(3)
1352
LEAD_CASE(4)
1353
# undef LEAD_CASE
1354
case BT_AMP:
1355
if (ptr == start)
1356
return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1357
*nextTokPtr = ptr;
1358
return XML_TOK_DATA_CHARS;
1359
case BT_PERCNT:
1360
if (ptr == start) {
1361
int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1362
return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1363
}
1364
*nextTokPtr = ptr;
1365
return XML_TOK_DATA_CHARS;
1366
case BT_LF:
1367
if (ptr == start) {
1368
*nextTokPtr = ptr + MINBPC(enc);
1369
return XML_TOK_DATA_NEWLINE;
1370
}
1371
*nextTokPtr = ptr;
1372
return XML_TOK_DATA_CHARS;
1373
case BT_CR:
1374
if (ptr == start) {
1375
ptr += MINBPC(enc);
1376
if (! HAS_CHAR(enc, ptr, end))
1377
return XML_TOK_TRAILING_CR;
1378
if (BYTE_TYPE(enc, ptr) == BT_LF)
1379
ptr += MINBPC(enc);
1380
*nextTokPtr = ptr;
1381
return XML_TOK_DATA_NEWLINE;
1382
}
1383
*nextTokPtr = ptr;
1384
return XML_TOK_DATA_CHARS;
1385
default:
1386
ptr += MINBPC(enc);
1387
break;
1388
}
1389
}
1390
*nextTokPtr = ptr;
1391
return XML_TOK_DATA_CHARS;
1392
}
1393
1394
# ifdef XML_DTD
1395
1396
static int PTRCALL
1397
PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1398
const char **nextTokPtr) {
1399
int level = 0;
1400
if (MINBPC(enc) > 1) {
1401
size_t n = end - ptr;
1402
if (n & (MINBPC(enc) - 1)) {
1403
n &= ~(MINBPC(enc) - 1);
1404
end = ptr + n;
1405
}
1406
}
1407
while (HAS_CHAR(enc, ptr, end)) {
1408
switch (BYTE_TYPE(enc, ptr)) {
1409
INVALID_CASES(ptr, nextTokPtr)
1410
case BT_LT:
1411
ptr += MINBPC(enc);
1412
REQUIRE_CHAR(enc, ptr, end);
1413
if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1414
ptr += MINBPC(enc);
1415
REQUIRE_CHAR(enc, ptr, end);
1416
if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1417
++level;
1418
ptr += MINBPC(enc);
1419
}
1420
}
1421
break;
1422
case BT_RSQB:
1423
ptr += MINBPC(enc);
1424
REQUIRE_CHAR(enc, ptr, end);
1425
if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1426
ptr += MINBPC(enc);
1427
REQUIRE_CHAR(enc, ptr, end);
1428
if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1429
ptr += MINBPC(enc);
1430
if (level == 0) {
1431
*nextTokPtr = ptr;
1432
return XML_TOK_IGNORE_SECT;
1433
}
1434
--level;
1435
}
1436
}
1437
break;
1438
default:
1439
ptr += MINBPC(enc);
1440
break;
1441
}
1442
}
1443
return XML_TOK_PARTIAL;
1444
}
1445
1446
# endif /* XML_DTD */
1447
1448
static int PTRCALL
1449
PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1450
const char **badPtr) {
1451
ptr += MINBPC(enc);
1452
end -= MINBPC(enc);
1453
for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1454
switch (BYTE_TYPE(enc, ptr)) {
1455
case BT_DIGIT:
1456
case BT_HEX:
1457
case BT_MINUS:
1458
case BT_APOS:
1459
case BT_LPAR:
1460
case BT_RPAR:
1461
case BT_PLUS:
1462
case BT_COMMA:
1463
case BT_SOL:
1464
case BT_EQUALS:
1465
case BT_QUEST:
1466
case BT_CR:
1467
case BT_LF:
1468
case BT_SEMI:
1469
case BT_EXCL:
1470
case BT_AST:
1471
case BT_PERCNT:
1472
case BT_NUM:
1473
# ifdef XML_NS
1474
case BT_COLON:
1475
# endif
1476
break;
1477
case BT_S:
1478
if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1479
*badPtr = ptr;
1480
return 0;
1481
}
1482
break;
1483
case BT_NAME:
1484
case BT_NMSTRT:
1485
if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1486
break;
1487
/* fall through */
1488
default:
1489
switch (BYTE_TO_ASCII(enc, ptr)) {
1490
case 0x24: /* $ */
1491
case 0x40: /* @ */
1492
break;
1493
default:
1494
*badPtr = ptr;
1495
return 0;
1496
}
1497
break;
1498
}
1499
}
1500
return 1;
1501
}
1502
1503
/* This must only be called for a well-formed start-tag or empty
1504
element tag. Returns the number of attributes. Pointers to the
1505
first attsMax attributes are stored in atts.
1506
*/
1507
1508
static int PTRCALL
1509
PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1510
ATTRIBUTE *atts) {
1511
enum { other, inName, inValue } state = inName;
1512
int nAtts = 0;
1513
int open = 0; /* defined when state == inValue;
1514
initialization just to shut up compilers */
1515
1516
for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1517
switch (BYTE_TYPE(enc, ptr)) {
1518
# define START_NAME \
1519
if (state == other) { \
1520
if (nAtts < attsMax) { \
1521
atts[nAtts].name = ptr; \
1522
atts[nAtts].normalized = 1; \
1523
} \
1524
state = inName; \
1525
}
1526
# define LEAD_CASE(n) \
1527
case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
1528
START_NAME ptr += (n - MINBPC(enc)); \
1529
break;
1530
LEAD_CASE(2)
1531
LEAD_CASE(3)
1532
LEAD_CASE(4)
1533
# undef LEAD_CASE
1534
case BT_NONASCII:
1535
case BT_NMSTRT:
1536
case BT_HEX:
1537
START_NAME
1538
break;
1539
# undef START_NAME
1540
case BT_QUOT:
1541
if (state != inValue) {
1542
if (nAtts < attsMax)
1543
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1544
state = inValue;
1545
open = BT_QUOT;
1546
} else if (open == BT_QUOT) {
1547
state = other;
1548
if (nAtts < attsMax)
1549
atts[nAtts].valueEnd = ptr;
1550
nAtts++;
1551
}
1552
break;
1553
case BT_APOS:
1554
if (state != inValue) {
1555
if (nAtts < attsMax)
1556
atts[nAtts].valuePtr = ptr + MINBPC(enc);
1557
state = inValue;
1558
open = BT_APOS;
1559
} else if (open == BT_APOS) {
1560
state = other;
1561
if (nAtts < attsMax)
1562
atts[nAtts].valueEnd = ptr;
1563
nAtts++;
1564
}
1565
break;
1566
case BT_AMP:
1567
if (nAtts < attsMax)
1568
atts[nAtts].normalized = 0;
1569
break;
1570
case BT_S:
1571
if (state == inName)
1572
state = other;
1573
else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1574
&& (ptr == atts[nAtts].valuePtr
1575
|| BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1576
|| BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1577
|| BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1578
atts[nAtts].normalized = 0;
1579
break;
1580
case BT_CR:
1581
case BT_LF:
1582
/* This case ensures that the first attribute name is counted
1583
Apart from that we could just change state on the quote. */
1584
if (state == inName)
1585
state = other;
1586
else if (state == inValue && nAtts < attsMax)
1587
atts[nAtts].normalized = 0;
1588
break;
1589
case BT_GT:
1590
case BT_SOL:
1591
if (state != inValue)
1592
return nAtts;
1593
break;
1594
default:
1595
break;
1596
}
1597
}
1598
/* not reached */
1599
}
1600
1601
static int PTRFASTCALL
1602
PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1603
int result = 0;
1604
/* skip &# */
1605
UNUSED_P(enc);
1606
ptr += 2 * MINBPC(enc);
1607
if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1608
for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1609
ptr += MINBPC(enc)) {
1610
int c = BYTE_TO_ASCII(enc, ptr);
1611
switch (c) {
1612
case ASCII_0:
1613
case ASCII_1:
1614
case ASCII_2:
1615
case ASCII_3:
1616
case ASCII_4:
1617
case ASCII_5:
1618
case ASCII_6:
1619
case ASCII_7:
1620
case ASCII_8:
1621
case ASCII_9:
1622
result <<= 4;
1623
result |= (c - ASCII_0);
1624
break;
1625
case ASCII_A:
1626
case ASCII_B:
1627
case ASCII_C:
1628
case ASCII_D:
1629
case ASCII_E:
1630
case ASCII_F:
1631
result <<= 4;
1632
result += 10 + (c - ASCII_A);
1633
break;
1634
case ASCII_a:
1635
case ASCII_b:
1636
case ASCII_c:
1637
case ASCII_d:
1638
case ASCII_e:
1639
case ASCII_f:
1640
result <<= 4;
1641
result += 10 + (c - ASCII_a);
1642
break;
1643
}
1644
if (result >= 0x110000)
1645
return -1;
1646
}
1647
} else {
1648
for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1649
int c = BYTE_TO_ASCII(enc, ptr);
1650
result *= 10;
1651
result += (c - ASCII_0);
1652
if (result >= 0x110000)
1653
return -1;
1654
}
1655
}
1656
return checkCharRefNumber(result);
1657
}
1658
1659
static int PTRCALL
1660
PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1661
const char *end) {
1662
UNUSED_P(enc);
1663
switch ((end - ptr) / MINBPC(enc)) {
1664
case 2:
1665
if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1666
switch (BYTE_TO_ASCII(enc, ptr)) {
1667
case ASCII_l:
1668
return ASCII_LT;
1669
case ASCII_g:
1670
return ASCII_GT;
1671
}
1672
}
1673
break;
1674
case 3:
1675
if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1676
ptr += MINBPC(enc);
1677
if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1678
ptr += MINBPC(enc);
1679
if (CHAR_MATCHES(enc, ptr, ASCII_p))
1680
return ASCII_AMP;
1681
}
1682
}
1683
break;
1684
case 4:
1685
switch (BYTE_TO_ASCII(enc, ptr)) {
1686
case ASCII_q:
1687
ptr += MINBPC(enc);
1688
if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1689
ptr += MINBPC(enc);
1690
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1691
ptr += MINBPC(enc);
1692
if (CHAR_MATCHES(enc, ptr, ASCII_t))
1693
return ASCII_QUOT;
1694
}
1695
}
1696
break;
1697
case ASCII_a:
1698
ptr += MINBPC(enc);
1699
if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1700
ptr += MINBPC(enc);
1701
if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1702
ptr += MINBPC(enc);
1703
if (CHAR_MATCHES(enc, ptr, ASCII_s))
1704
return ASCII_APOS;
1705
}
1706
}
1707
break;
1708
}
1709
}
1710
return 0;
1711
}
1712
1713
static int PTRCALL
1714
PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1715
const char *end1, const char *ptr2) {
1716
UNUSED_P(enc);
1717
for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1718
if (end1 - ptr1 < MINBPC(enc)) {
1719
/* This line cannot be executed. The incoming data has already
1720
* been tokenized once, so incomplete characters like this have
1721
* already been eliminated from the input. Retaining the
1722
* paranoia check is still valuable, however.
1723
*/
1724
return 0; /* LCOV_EXCL_LINE */
1725
}
1726
if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1727
return 0;
1728
}
1729
return ptr1 == end1;
1730
}
1731
1732
static int PTRFASTCALL
1733
PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1734
const char *start = ptr;
1735
for (;;) {
1736
switch (BYTE_TYPE(enc, ptr)) {
1737
# define LEAD_CASE(n) \
1738
case BT_LEAD##n: \
1739
ptr += n; /* NOTE: The encoding has already been validated. */ \
1740
break;
1741
LEAD_CASE(2)
1742
LEAD_CASE(3)
1743
LEAD_CASE(4)
1744
# undef LEAD_CASE
1745
case BT_NONASCII:
1746
case BT_NMSTRT:
1747
# ifdef XML_NS
1748
case BT_COLON:
1749
# endif
1750
case BT_HEX:
1751
case BT_DIGIT:
1752
case BT_NAME:
1753
case BT_MINUS:
1754
ptr += MINBPC(enc);
1755
break;
1756
default:
1757
return (int)(ptr - start);
1758
}
1759
}
1760
}
1761
1762
static const char *PTRFASTCALL
1763
PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1764
for (;;) {
1765
switch (BYTE_TYPE(enc, ptr)) {
1766
case BT_LF:
1767
case BT_CR:
1768
case BT_S:
1769
ptr += MINBPC(enc);
1770
break;
1771
default:
1772
return ptr;
1773
}
1774
}
1775
}
1776
1777
static void PTRCALL
1778
PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1779
POSITION *pos) {
1780
while (HAS_CHAR(enc, ptr, end)) {
1781
switch (BYTE_TYPE(enc, ptr)) {
1782
# define LEAD_CASE(n) \
1783
case BT_LEAD##n: \
1784
ptr += n; /* NOTE: The encoding has already been validated. */ \
1785
pos->columnNumber++; \
1786
break;
1787
LEAD_CASE(2)
1788
LEAD_CASE(3)
1789
LEAD_CASE(4)
1790
# undef LEAD_CASE
1791
case BT_LF:
1792
pos->columnNumber = 0;
1793
pos->lineNumber++;
1794
ptr += MINBPC(enc);
1795
break;
1796
case BT_CR:
1797
pos->lineNumber++;
1798
ptr += MINBPC(enc);
1799
if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1800
ptr += MINBPC(enc);
1801
pos->columnNumber = 0;
1802
break;
1803
default:
1804
ptr += MINBPC(enc);
1805
pos->columnNumber++;
1806
break;
1807
}
1808
}
1809
}
1810
1811
/* Remove the helper macros again; this file is included several times
   from xmltok.c (see the note at the top of the file). */
#  undef DO_LEAD_CASE
#  undef MULTIBYTE_CASES
#  undef INVALID_CASES
#  undef CHECK_NAME_CASE
#  undef CHECK_NAME_CASES
#  undef CHECK_NMSTRT_CASE
#  undef CHECK_NMSTRT_CASES
1818
1819
#endif /* XML_TOK_IMPL_C */
1820
1821