Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmexpat/lib/xmltok.c
3153 views
1
/*
2
__ __ _
3
___\ \/ /_ __ __ _| |_
4
/ _ \\ /| '_ \ / _` | __|
5
| __// \| |_) | (_| | |_
6
\___/_/\_\ .__/ \__,_|\__|
7
|_| XML parser
8
9
Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
Copyright (c) 2000 Clark Cooper <[email protected]>
11
Copyright (c) 2001-2003 Fred L. Drake, Jr. <[email protected]>
12
Copyright (c) 2002 Greg Stein <[email protected]>
13
Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
14
Copyright (c) 2005-2009 Steven Solie <[email protected]>
15
Copyright (c) 2016-2024 Sebastian Pipping <[email protected]>
16
Copyright (c) 2016 Pascal Cuoq <[email protected]>
17
Copyright (c) 2016 Don Lewis <[email protected]>
18
Copyright (c) 2017 Rhodri James <[email protected]>
19
Copyright (c) 2017 Alexander Bluhm <[email protected]>
20
Copyright (c) 2017 Benbuck Nason <[email protected]>
21
Copyright (c) 2017 José Gutiérrez de la Concha <[email protected]>
22
Copyright (c) 2019 David Loffredo <[email protected]>
23
Copyright (c) 2021 Donghee Na <[email protected]>
24
Copyright (c) 2022 Martin Ettl <[email protected]>
25
Copyright (c) 2022 Sean McBride <[email protected]>
26
Copyright (c) 2023 Hanno Böck <[email protected]>
27
Licensed under the MIT license:
28
29
Permission is hereby granted, free of charge, to any person obtaining
30
a copy of this software and associated documentation files (the
31
"Software"), to deal in the Software without restriction, including
32
without limitation the rights to use, copy, modify, merge, publish,
33
distribute, sublicense, and/or sell copies of the Software, and to permit
34
persons to whom the Software is furnished to do so, subject to the
35
following conditions:
36
37
The above copyright notice and this permission notice shall be included
38
in all copies or substantial portions of the Software.
39
40
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
45
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46
USE OR OTHER DEALINGS IN THE SOFTWARE.
47
*/
48
49
#include "expat_config.h"
50
51
#include <stddef.h>
52
#include <string.h> /* memcpy */
53
#include <stdbool.h>
54
55
#ifdef _WIN32
56
# include "winconfig.h"
57
#endif
58
59
#include "expat_external.h"
60
#include "internal.h"
61
#include "xmltok.h"
62
#include "nametab.h"
63
64
#ifdef XML_DTD
65
# define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
66
#else
67
# define IGNORE_SECTION_TOK_VTABLE /* as nothing */
68
#endif
69
70
#define VTABLE1 \
71
{PREFIX(prologTok), PREFIX(contentTok), \
72
PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
73
{PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
74
PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
75
PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
76
PREFIX(updatePosition), PREFIX(isPublicId)
77
78
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
80
#define UCS2_GET_NAMING(pages, hi, lo) \
81
(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
82
83
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the two bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
87
#define UTF8_GET_NAMING2(pages, byte) \
88
(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
89
+ ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
90
& (1u << (((byte)[1]) & 0x1F)))
91
92
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the three bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
97
#define UTF8_GET_NAMING3(pages, byte) \
98
(namingBitmap \
99
[((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
100
<< 3) \
101
+ ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
102
& (1u << (((byte)[2]) & 0x1F)))
103
104
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
105
of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
106
with the additional restriction of not allowing the Unicode
107
code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
108
Implementation details:
109
(A & 0x80) == 0 means A < 0x80
110
and
111
(A & 0xC0) == 0xC0 means A > 0xBF
112
*/
113
114
/* Non-zero when the two bytes at p cannot form an accepted 2-byte UTF-8
   sequence: the lead byte is below 0xC2, or the second byte lies outside
   0x80..0xBF.  The argument is expanded several times, so it must have no
   side effects; callers in this file pass a const unsigned char pointer. */
#define UTF8_INVALID2(p) \
115
((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
116
117
/* Non-zero when the three bytes at p cannot form an accepted 3-byte UTF-8
   sequence.  Checks performed, in order:
     - third byte below 0x80 is rejected;
     - after EF BF the third byte must not exceed 0xBD (this excludes
       EF,BF,BE and EF,BF,BF); otherwise it must not exceed 0xBF;
     - after lead byte E0 the second byte must lie in 0xA0..0xBF;
     - after lead byte ED the second byte must lie in 0x80..0x9F;
     - after any other lead byte the second byte must lie in 0x80..0xBF.
   The lead byte itself is not range-checked here.  The argument is expanded
   several times, so it must have no side effects. */
#define UTF8_INVALID3(p) \
118
(((p)[2] & 0x80) == 0 \
119
|| ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
120
: ((p)[2] & 0xC0) == 0xC0) \
121
|| ((*p) == 0xE0 \
122
? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
123
: ((p)[1] & 0x80) == 0 \
124
|| ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
125
126
/* Non-zero when the four bytes at p cannot form an accepted 4-byte UTF-8
   sequence.  The third and fourth bytes must both lie in 0x80..0xBF.  The
   second byte must lie in 0x90..0xBF after lead byte F0, in 0x80..0x8F
   after lead byte F4, and in 0x80..0xBF after any other lead byte.  The
   lead byte itself is not range-checked here.  The argument is expanded
   several times, so it must have no side effects. */
#define UTF8_INVALID4(p) \
127
(((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
128
|| ((p)[2] & 0xC0) == 0xC0 \
129
|| ((*p) == 0xF0 \
130
? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
131
: ((p)[1] & 0x80) == 0 \
132
|| ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
133
134
static int PTRFASTCALL
135
isNever(const ENCODING *enc, const char *p) {
136
UNUSED_P(enc);
137
UNUSED_P(p);
138
return 0;
139
}
140
141
static int PTRFASTCALL
142
utf8_isName2(const ENCODING *enc, const char *p) {
143
UNUSED_P(enc);
144
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
145
}
146
147
static int PTRFASTCALL
148
utf8_isName3(const ENCODING *enc, const char *p) {
149
UNUSED_P(enc);
150
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151
}
152
153
#define utf8_isName4 isNever
154
155
static int PTRFASTCALL
156
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
157
UNUSED_P(enc);
158
return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
159
}
160
161
static int PTRFASTCALL
162
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
163
UNUSED_P(enc);
164
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165
}
166
167
#define utf8_isNmstrt4 isNever
168
169
static int PTRFASTCALL
170
utf8_isInvalid2(const ENCODING *enc, const char *p) {
171
UNUSED_P(enc);
172
return UTF8_INVALID2((const unsigned char *)p);
173
}
174
175
static int PTRFASTCALL
176
utf8_isInvalid3(const ENCODING *enc, const char *p) {
177
UNUSED_P(enc);
178
return UTF8_INVALID3((const unsigned char *)p);
179
}
180
181
static int PTRFASTCALL
182
utf8_isInvalid4(const ENCODING *enc, const char *p) {
183
UNUSED_P(enc);
184
return UTF8_INVALID4((const unsigned char *)p);
185
}
186
187
struct normal_encoding {
188
ENCODING enc;
189
unsigned char type[256];
190
#ifdef XML_MIN_SIZE
191
int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
192
int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
193
int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
194
int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
195
int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
196
#endif /* XML_MIN_SIZE */
197
int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
198
int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
199
int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
200
int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
201
int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
202
int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
203
int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
204
int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
205
int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
206
};
207
208
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
209
210
#ifdef XML_MIN_SIZE
211
212
# define STANDARD_VTABLE(E) \
213
E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
214
215
#else
216
217
# define STANDARD_VTABLE(E) /* as nothing */
218
219
#endif
220
221
#define NORMAL_VTABLE(E) \
222
E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
223
E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
224
225
#define NULL_VTABLE \
226
/* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
227
/* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
228
/* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
229
230
static int FASTCALL checkCharRefNumber(int result);
231
232
#include "xmltok_impl.h"
233
#include "ascii.h"
234
235
#ifdef XML_MIN_SIZE
236
# define sb_isNameMin isNever
237
# define sb_isNmstrtMin isNever
238
#endif
239
240
#ifdef XML_MIN_SIZE
241
# define MINBPC(enc) ((enc)->minBytesPerChar)
242
#else
243
/* minimum bytes per character */
244
# define MINBPC(enc) 1
245
#endif
246
247
#define SB_BYTE_TYPE(enc, p) \
248
(((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
249
250
#ifdef XML_MIN_SIZE
251
static int PTRFASTCALL
252
sb_byteType(const ENCODING *enc, const char *p) {
253
return SB_BYTE_TYPE(enc, p);
254
}
255
# define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
256
#else
257
# define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
258
#endif
259
260
#ifdef XML_MIN_SIZE
261
# define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
262
static int PTRFASTCALL
263
sb_byteToAscii(const ENCODING *enc, const char *p) {
264
UNUSED_P(enc);
265
return *p;
266
}
267
#else
268
# define BYTE_TO_ASCII(enc, p) (*(p))
269
#endif
270
271
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
272
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
273
#ifdef XML_MIN_SIZE
274
# define IS_INVALID_CHAR(enc, p, n) \
275
(AS_NORMAL_ENCODING(enc)->isInvalid##n \
276
&& AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
277
#else
278
# define IS_INVALID_CHAR(enc, p, n) \
279
(AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
280
#endif
281
282
#ifdef XML_MIN_SIZE
283
# define IS_NAME_CHAR_MINBPC(enc, p) \
284
(AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
285
# define IS_NMSTRT_CHAR_MINBPC(enc, p) \
286
(AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
287
#else
288
# define IS_NAME_CHAR_MINBPC(enc, p) (0)
289
# define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
290
#endif
291
292
#ifdef XML_MIN_SIZE
293
# define CHAR_MATCHES(enc, p, c) \
294
(AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
295
static int PTRCALL
296
sb_charMatches(const ENCODING *enc, const char *p, int c) {
297
UNUSED_P(enc);
298
return *p == c;
299
}
300
#else
301
/* c is an ASCII character */
302
# define CHAR_MATCHES(enc, p, c) (*(p) == (c))
303
#endif
304
305
#define PREFIX(ident) normal_##ident
306
#define XML_TOK_IMPL_C
307
#include "xmltok_impl.c"
308
#undef XML_TOK_IMPL_C
309
310
#undef MINBPC
311
#undef BYTE_TYPE
312
#undef BYTE_TO_ASCII
313
#undef CHAR_MATCHES
314
#undef IS_NAME_CHAR
315
#undef IS_NAME_CHAR_MINBPC
316
#undef IS_NMSTRT_CHAR
317
#undef IS_NMSTRT_CHAR_MINBPC
318
#undef IS_INVALID_CHAR
319
320
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
326
327
/* Pulls *fromLimRef back so that the range [from, *fromLimRef) does not end
   in the middle of a multi-byte UTF-8 character.

   The range is scanned backwards from its end.  Bytes that are neither
   ASCII nor a 2/3/4-byte lead byte are stepped over and counted.  On
   reaching a lead byte, the character is kept (the limit is placed right
   after its last byte) when enough bytes follow it; otherwise the lead byte
   is dropped as well and the scan continues.  An ASCII byte ends the scan
   with the limit directly after it.  If the scan reaches `from`, the limit
   becomes `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes already skipped behind the current one */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t needed; /* sequence length announced by a lead byte, else 0 */

    if ((last & 0x80u) == 0x00u)
      break; /* 1-byte character, matching 0b0xxxxxxx */

    if ((last & 0xf8u) == 0xf0u)
      needed = 4; /* lead byte 0b11110xxx */
    else if ((last & 0xf0u) == 0xe0u)
      needed = 3; /* lead byte 0b1110xxxx */
    else if ((last & 0xe0u) == 0xc0u)
      needed = 2; /* lead byte 0b110xxxxx */
    else
      needed = 0; /* any other byte: just keep walking */

    if (needed != 0) {
      if (stepped + 1 >= needed) {
        /* Character is complete: place the limit right after it. */
        end += needed - 1;
        break;
      }
      /* Truncated character: discard it and restart the count. */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
365
366
static enum XML_Convert_Result PTRCALL
367
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
368
char **toP, const char *toLim) {
369
bool input_incomplete = false;
370
bool output_exhausted = false;
371
372
/* Avoid copying partial characters (due to limited space). */
373
const ptrdiff_t bytesAvailable = fromLim - *fromP;
374
const ptrdiff_t bytesStorable = toLim - *toP;
375
UNUSED_P(enc);
376
if (bytesAvailable > bytesStorable) {
377
fromLim = *fromP + bytesStorable;
378
output_exhausted = true;
379
}
380
381
/* Avoid copying partial characters (from incomplete input). */
382
{
383
const char *const fromLimBefore = fromLim;
384
_INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
385
if (fromLim < fromLimBefore) {
386
input_incomplete = true;
387
}
388
}
389
390
{
391
const ptrdiff_t bytesToCopy = fromLim - *fromP;
392
memcpy(*toP, *fromP, bytesToCopy);
393
*fromP += bytesToCopy;
394
*toP += bytesToCopy;
395
}
396
397
if (output_exhausted) /* needs to go first */
398
return XML_CONVERT_OUTPUT_EXHAUSTED;
399
else if (input_incomplete)
400
return XML_CONVERT_INPUT_INCOMPLETE;
401
else
402
return XML_CONVERT_COMPLETED;
403
}
404
405
static enum XML_Convert_Result PTRCALL
406
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
407
unsigned short **toP, const unsigned short *toLim) {
408
enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
409
unsigned short *to = *toP;
410
const char *from = *fromP;
411
while (from < fromLim && to < toLim) {
412
switch (SB_BYTE_TYPE(enc, from)) {
413
case BT_LEAD2:
414
if (fromLim - from < 2) {
415
res = XML_CONVERT_INPUT_INCOMPLETE;
416
goto after;
417
}
418
*to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
419
from += 2;
420
break;
421
case BT_LEAD3:
422
if (fromLim - from < 3) {
423
res = XML_CONVERT_INPUT_INCOMPLETE;
424
goto after;
425
}
426
*to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
427
| (from[2] & 0x3f));
428
from += 3;
429
break;
430
case BT_LEAD4: {
431
unsigned long n;
432
if (toLim - to < 2) {
433
res = XML_CONVERT_OUTPUT_EXHAUSTED;
434
goto after;
435
}
436
if (fromLim - from < 4) {
437
res = XML_CONVERT_INPUT_INCOMPLETE;
438
goto after;
439
}
440
n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
441
| ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
442
n -= 0x10000;
443
to[0] = (unsigned short)((n >> 10) | 0xD800);
444
to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
445
to += 2;
446
from += 4;
447
} break;
448
default:
449
*to++ = *from++;
450
break;
451
}
452
}
453
if (from < fromLim)
454
res = XML_CONVERT_OUTPUT_EXHAUSTED;
455
after:
456
*fromP = from;
457
*toP = to;
458
return res;
459
}
460
461
#ifdef XML_NS
462
/* UTF-8 encoding descriptor for XML_NS builds.  The ENCODING part takes its
   tokenizer entries from VTABLE1 plus the utf8_toUtf8 / utf8_toUtf16
   converters; the three trailing integers initialise the remaining ENCODING
   fields (declared in xmltok.h, not visible here).  type[] is assembled from
   asciitab.h and utf8tab.h with BT_COLON left as is; the predicate hooks are
   the sb_ (XML_MIN_SIZE only) and utf8_ functions. */
static const struct normal_encoding utf8_encoding_ns
463
= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
464
{
465
# include "asciitab.h"
466
# include "utf8tab.h"
467
},
468
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
469
#endif
470
471
/* UTF-8 encoding descriptor.  Same initialisers as the _ns variant except
   that BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so
   every table entry written as BT_COLON becomes BT_NMSTRT. */
static const struct normal_encoding utf8_encoding
472
= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
473
{
474
#define BT_COLON BT_NMSTRT
475
#include "asciitab.h"
476
#undef BT_COLON
477
#include "utf8tab.h"
478
},
479
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
480
481
#ifdef XML_NS
482
483
/* "Internal" UTF-8 encoding descriptor for XML_NS builds.  Identical to
   utf8_encoding_ns except that the low half of type[] comes from
   iasciitab.h instead of asciitab.h (the difference between those two
   headers is not visible here). */
static const struct normal_encoding internal_utf8_encoding_ns
484
= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
485
{
486
# include "iasciitab.h"
487
# include "utf8tab.h"
488
},
489
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
490
491
#endif
492
493
/* "Internal" UTF-8 encoding descriptor.  type[] is built from iasciitab.h
   (with BT_COLON defined as BT_NMSTRT during the include) and utf8tab.h;
   everything else matches utf8_encoding. */
static const struct normal_encoding internal_utf8_encoding
494
= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
495
{
496
#define BT_COLON BT_NMSTRT
497
#include "iasciitab.h"
498
#undef BT_COLON
499
#include "utf8tab.h"
500
},
501
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
502
503
static enum XML_Convert_Result PTRCALL
504
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
505
char **toP, const char *toLim) {
506
UNUSED_P(enc);
507
for (;;) {
508
unsigned char c;
509
if (*fromP == fromLim)
510
return XML_CONVERT_COMPLETED;
511
c = (unsigned char)**fromP;
512
if (c & 0x80) {
513
if (toLim - *toP < 2)
514
return XML_CONVERT_OUTPUT_EXHAUSTED;
515
*(*toP)++ = (char)((c >> 6) | UTF8_cval2);
516
*(*toP)++ = (char)((c & 0x3f) | 0x80);
517
(*fromP)++;
518
} else {
519
if (*toP == toLim)
520
return XML_CONVERT_OUTPUT_EXHAUSTED;
521
*(*toP)++ = *(*fromP)++;
522
}
523
}
524
}
525
526
static enum XML_Convert_Result PTRCALL
527
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
528
unsigned short **toP, const unsigned short *toLim) {
529
UNUSED_P(enc);
530
while (*fromP < fromLim && *toP < toLim)
531
*(*toP)++ = (unsigned char)*(*fromP)++;
532
533
if ((*toP == toLim) && (*fromP < fromLim))
534
return XML_CONVERT_OUTPUT_EXHAUSTED;
535
else
536
return XML_CONVERT_COMPLETED;
537
}
538
539
#ifdef XML_NS
540
541
/* Latin-1 encoding descriptor for XML_NS builds.  The ENCODING part uses
   VTABLE1 with the latin1_toUtf8 / latin1_toUtf16 converters; the trailing
   integers initialise the remaining ENCODING fields (declared in xmltok.h,
   not visible here).  type[] comes from asciitab.h and latin1tab.h.  All
   isName / isNmstrt / isInvalid hooks are NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding_ns
542
= {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
543
{
544
# include "asciitab.h"
545
# include "latin1tab.h"
546
},
547
STANDARD_VTABLE(sb_) NULL_VTABLE};
548
549
#endif
550
551
/* Latin-1 encoding descriptor.  Same as latin1_encoding_ns except that
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding latin1_encoding
552
= {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
553
{
554
#define BT_COLON BT_NMSTRT
555
#include "asciitab.h"
556
#undef BT_COLON
557
#include "latin1tab.h"
558
},
559
STANDARD_VTABLE(sb_) NULL_VTABLE};
560
561
static enum XML_Convert_Result PTRCALL
562
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
563
char **toP, const char *toLim) {
564
UNUSED_P(enc);
565
while (*fromP < fromLim && *toP < toLim)
566
*(*toP)++ = *(*fromP)++;
567
568
if ((*toP == toLim) && (*fromP < fromLim))
569
return XML_CONVERT_OUTPUT_EXHAUSTED;
570
else
571
return XML_CONVERT_COMPLETED;
572
}
573
574
#ifdef XML_NS
575
576
/* ASCII encoding descriptor for XML_NS builds.  Uses VTABLE1 with
   ascii_toUtf8 and (shared with Latin-1) latin1_toUtf16.  Only the part of
   type[] provided by asciitab.h is initialised explicitly; the remaining
   entries are zero-initialised, which the in-table comment equates with
   BT_NONXML.  All isName / isNmstrt / isInvalid hooks are NULL. */
static const struct normal_encoding ascii_encoding_ns
577
= {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
578
{
579
# include "asciitab.h"
580
/* BT_NONXML == 0 */
581
},
582
STANDARD_VTABLE(sb_) NULL_VTABLE};
583
584
#endif
585
586
/* ASCII encoding descriptor.  Same as ascii_encoding_ns except that
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included.  Table
   entries not supplied by asciitab.h stay zero (BT_NONXML per the in-table
   comment). */
static const struct normal_encoding ascii_encoding
587
= {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
588
{
589
#define BT_COLON BT_NMSTRT
590
#include "asciitab.h"
591
#undef BT_COLON
592
/* BT_NONXML == 0 */
593
},
594
STANDARD_VTABLE(sb_) NULL_VTABLE};
595
596
static int PTRFASTCALL
597
unicode_byte_type(char hi, char lo) {
598
switch ((unsigned char)hi) {
599
/* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
600
case 0xD8:
601
case 0xD9:
602
case 0xDA:
603
case 0xDB:
604
return BT_LEAD4;
605
/* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
606
case 0xDC:
607
case 0xDD:
608
case 0xDE:
609
case 0xDF:
610
return BT_TRAIL;
611
case 0xFF:
612
switch ((unsigned char)lo) {
613
case 0xFF: /* noncharacter-FFFF */
614
case 0xFE: /* noncharacter-FFFE */
615
return BT_NONXML;
616
}
617
break;
618
}
619
return BT_NONASCII;
620
}
621
622
/* Generates E##toUtf8(), a converter from 16-bit units to UTF-8.  GET_LO
   and GET_HI must be defined at the point of expansion; they select the low
   and high byte of the unit at a pointer and thereby fix the byte order.

   The generated function consumes whole 2-byte units from
   [*fromP, fromLim) (a trailing odd byte is ignored) and writes to
   [*toP, toLim):
     - hi == 0 and lo < 0x80      -> 1 byte
     - hi 0x00..0x07 otherwise    -> 2 bytes
     - hi 0xD8..0xDB              -> 4 bytes, built from this unit and the
                                     next one (the next unit is not checked)
     - every other value          -> 3 bytes
   Before each character the required output space is checked; on shortage
   *fromP is set to the current unit and XML_CONVERT_OUTPUT_EXHAUSTED is
   returned.  A 0xD8..0xDB unit without a following unit yields
   XML_CONVERT_INPUT_INCOMPLETE.  *toP always reflects the bytes written. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *src = *fromP;                                                  \
    UNUSED_P(enc);                                                             \
    fromLim = src + (((fromLim - src) >> 1) << 1); /* shrink to even */        \
    while (src < fromLim) {                                                    \
      const unsigned char lo = GET_LO(src);                                    \
      const unsigned char hi = GET_HI(src);                                    \
      if (hi == 0 && lo < 0x80) {                                              \
        /* single byte */                                                      \
        if (*toP == toLim) {                                                   \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = lo;                                                        \
      } else if (hi <= 0x7) {                                                  \
        /* 11 bits divided 5, 6 amongst 2 bytes */                             \
        if (toLim - *toP < 2) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      } else if (hi >= 0xD8 && hi <= 0xDB) {                                   \
        /* first unit of a pair: 4 bytes from two units */                     \
        int plane;                                                             \
        unsigned char lo2;                                                     \
        if (toLim - *toP < 4) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - src < 4) {                                               \
          *fromP = src;                                                        \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        src += 2;                                                              \
        lo2 = GET_LO(src);                                                     \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(src) & 0x3) << 2)            \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
      } else {                                                                 \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        if (toLim - *toP < 3) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      }                                                                        \
      src += 2;                                                                \
    }                                                                          \
    *fromP = src;                                                              \
    if (src < fromLim)                                                         \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    return XML_CONVERT_COMPLETED;                                              \
  }
698
699
/* Generates E##toUtf16(), which repacks 2-byte units from
   [*fromP, fromLim) into unsigned short values in [*toP, toLim).  GET_LO
   and GET_HI must be defined at the point of expansion and fix the byte
   order of the input.

   A trailing odd input byte is ignored.  When the input holds more units
   than the output can take and the last input unit has a high byte in
   0xD8..0xDF, that unit is held back and the result (unless the output
   fills up) is XML_CONVERT_INPUT_INCOMPLETE.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input remains.
   *fromP and *toP are advanced as units are copied. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result status = XML_CONVERT_COMPLETED;                    \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      status = XML_CONVERT_INPUT_INCOMPLETE;                                   \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
      *fromP += 2;                                                             \
    }                                                                          \
    return ((*toP == toLim) && (*fromP < fromLim))                             \
               ? XML_CONVERT_OUTPUT_EXHAUSTED                                  \
               : status;                                                       \
  }
719
720
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
721
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
722
723
DEFINE_UTF16_TO_UTF8(little2_)
724
DEFINE_UTF16_TO_UTF16(little2_)
725
726
#undef GET_LO
727
#undef GET_HI
728
729
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732
DEFINE_UTF16_TO_UTF8(big2_)
733
DEFINE_UTF16_TO_UTF16(big2_)
734
735
#undef GET_LO
736
#undef GET_HI
737
738
#define LITTLE2_BYTE_TYPE(enc, p) \
739
((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
740
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
741
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
742
#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
743
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
744
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
745
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
746
747
#ifdef XML_MIN_SIZE
748
749
static int PTRFASTCALL
750
little2_byteType(const ENCODING *enc, const char *p) {
751
return LITTLE2_BYTE_TYPE(enc, p);
752
}
753
754
static int PTRFASTCALL
755
little2_byteToAscii(const ENCODING *enc, const char *p) {
756
UNUSED_P(enc);
757
return LITTLE2_BYTE_TO_ASCII(p);
758
}
759
760
static int PTRCALL
761
little2_charMatches(const ENCODING *enc, const char *p, int c) {
762
UNUSED_P(enc);
763
return LITTLE2_CHAR_MATCHES(p, c);
764
}
765
766
static int PTRFASTCALL
767
little2_isNameMin(const ENCODING *enc, const char *p) {
768
UNUSED_P(enc);
769
return LITTLE2_IS_NAME_CHAR_MINBPC(p);
770
}
771
772
static int PTRFASTCALL
773
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
774
UNUSED_P(enc);
775
return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
776
}
777
778
# undef VTABLE
779
# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
780
781
#else /* not XML_MIN_SIZE */
782
783
# undef PREFIX
784
# define PREFIX(ident) little2_##ident
785
# define MINBPC(enc) 2
786
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
787
# define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
788
# define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
789
# define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
790
# define IS_NAME_CHAR(enc, p, n) 0
791
# define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
792
# define IS_NMSTRT_CHAR(enc, p, n) (0)
793
# define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
794
795
# define XML_TOK_IMPL_C
796
# include "xmltok_impl.c"
797
# undef XML_TOK_IMPL_C
798
799
# undef MINBPC
800
# undef BYTE_TYPE
801
# undef BYTE_TO_ASCII
802
# undef CHAR_MATCHES
803
# undef IS_NAME_CHAR
804
# undef IS_NAME_CHAR_MINBPC
805
# undef IS_NMSTRT_CHAR
806
# undef IS_NMSTRT_CHAR_MINBPC
807
# undef IS_INVALID_CHAR
808
809
#endif /* not XML_MIN_SIZE */
810
811
#ifdef XML_NS
812
813
/* Little-endian 16-bit encoding descriptor for XML_NS builds.  VTABLE, as
   configured at this point, supplies the tokenizer entries and the
   little2_toUtf8 / little2_toUtf16 converters.  Of the trailing ENCODING
   integers (fields declared in xmltok.h, not visible here) the last one is
   1 only when BYTEORDER == 1234.  type[] comes from asciitab.h and
   latin1tab.h; the isName / isNmstrt / isInvalid hooks are NULL. */
static const struct normal_encoding little2_encoding_ns
814
= {{VTABLE, 2, 0,
815
# if BYTEORDER == 1234
816
1
817
# else
818
0
819
# endif
820
},
821
{
822
# include "asciitab.h"
823
# include "latin1tab.h"
824
},
825
STANDARD_VTABLE(little2_) NULL_VTABLE};
826
827
#endif
828
829
/* Little-endian 16-bit encoding descriptor.  Same as little2_encoding_ns
   except that BT_COLON is defined as BT_NMSTRT while asciitab.h is
   included.  The last ENCODING integer is 1 only when BYTEORDER == 1234. */
static const struct normal_encoding little2_encoding
830
= {{VTABLE, 2, 0,
831
#if BYTEORDER == 1234
832
1
833
#else
834
0
835
#endif
836
},
837
{
838
#define BT_COLON BT_NMSTRT
839
#include "asciitab.h"
840
#undef BT_COLON
841
#include "latin1tab.h"
842
},
843
STANDARD_VTABLE(little2_) NULL_VTABLE};
844
845
#if BYTEORDER != 4321
846
847
# ifdef XML_NS
848
849
/* "Internal" little-endian 16-bit descriptor for XML_NS builds; compiled
   only when BYTEORDER != 4321.  Differs from little2_encoding_ns in taking
   the low half of type[] from iasciitab.h and in hard-wiring the last
   ENCODING integer to 1. */
static const struct normal_encoding internal_little2_encoding_ns
850
= {{VTABLE, 2, 0, 1},
851
{
852
# include "iasciitab.h"
853
# include "latin1tab.h"
854
},
855
STANDARD_VTABLE(little2_) NULL_VTABLE};
856
857
# endif
858
859
/* "Internal" little-endian 16-bit descriptor; compiled only when
   BYTEORDER != 4321.  type[] is built from iasciitab.h (with BT_COLON
   defined as BT_NMSTRT during the include) and latin1tab.h; the last
   ENCODING integer is hard-wired to 1. */
static const struct normal_encoding internal_little2_encoding
860
= {{VTABLE, 2, 0, 1},
861
{
862
# define BT_COLON BT_NMSTRT
863
# include "iasciitab.h"
864
# undef BT_COLON
865
# include "latin1tab.h"
866
},
867
STANDARD_VTABLE(little2_) NULL_VTABLE};
868
869
#endif
870
871
#define BIG2_BYTE_TYPE(enc, p) \
872
((p)[0] == 0 ? SB_BYTE_TYPE(enc, p + 1) : unicode_byte_type((p)[0], (p)[1]))
873
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
874
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
875
#define BIG2_IS_NAME_CHAR_MINBPC(p) \
876
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
877
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
878
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
879
880
#ifdef XML_MIN_SIZE
881
882
static int PTRFASTCALL
883
big2_byteType(const ENCODING *enc, const char *p) {
884
return BIG2_BYTE_TYPE(enc, p);
885
}
886
887
static int PTRFASTCALL
888
big2_byteToAscii(const ENCODING *enc, const char *p) {
889
UNUSED_P(enc);
890
return BIG2_BYTE_TO_ASCII(p);
891
}
892
893
static int PTRCALL
894
big2_charMatches(const ENCODING *enc, const char *p, int c) {
895
UNUSED_P(enc);
896
return BIG2_CHAR_MATCHES(p, c);
897
}
898
899
static int PTRFASTCALL
900
big2_isNameMin(const ENCODING *enc, const char *p) {
901
UNUSED_P(enc);
902
return BIG2_IS_NAME_CHAR_MINBPC(p);
903
}
904
905
static int PTRFASTCALL
906
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907
UNUSED_P(enc);
908
return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
909
}
910
911
# undef VTABLE
912
# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
913
914
#else /* not XML_MIN_SIZE */
915
916
# undef PREFIX
917
# define PREFIX(ident) big2_##ident
918
# define MINBPC(enc) 2
919
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
920
# define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921
# define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922
# define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
923
# define IS_NAME_CHAR(enc, p, n) 0
924
# define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925
# define IS_NMSTRT_CHAR(enc, p, n) (0)
926
# define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
927
928
# define XML_TOK_IMPL_C
929
# include "xmltok_impl.c"
930
# undef XML_TOK_IMPL_C
931
932
# undef MINBPC
933
# undef BYTE_TYPE
934
# undef BYTE_TO_ASCII
935
# undef CHAR_MATCHES
936
# undef IS_NAME_CHAR
937
# undef IS_NAME_CHAR_MINBPC
938
# undef IS_NMSTRT_CHAR
939
# undef IS_NMSTRT_CHAR_MINBPC
940
# undef IS_INVALID_CHAR
941
942
#endif /* not XML_MIN_SIZE */
943
944
#ifdef XML_NS
945
946
/* Big-endian 16-bit encoding descriptor for XML_NS builds.  VTABLE, as
   configured at this point, supplies the tokenizer entries and the
   big2_toUtf8 / big2_toUtf16 converters.  Of the trailing ENCODING integers
   (fields declared in xmltok.h, not visible here) the last one is 1 only
   when BYTEORDER == 4321.  type[] comes from asciitab.h and latin1tab.h;
   the isName / isNmstrt / isInvalid hooks are NULL. */
static const struct normal_encoding big2_encoding_ns
947
= {{VTABLE, 2, 0,
948
# if BYTEORDER == 4321
949
1
950
# else
951
0
952
# endif
953
},
954
{
955
# include "asciitab.h"
956
# include "latin1tab.h"
957
},
958
STANDARD_VTABLE(big2_) NULL_VTABLE};
959
960
#endif
961
962
/* Big-endian 16-bit encoding descriptor.  Same as big2_encoding_ns except
   that BT_COLON is defined as BT_NMSTRT while asciitab.h is included.  The
   last ENCODING integer is 1 only when BYTEORDER == 4321. */
static const struct normal_encoding big2_encoding
963
= {{VTABLE, 2, 0,
964
#if BYTEORDER == 4321
965
1
966
#else
967
0
968
#endif
969
},
970
{
971
#define BT_COLON BT_NMSTRT
972
#include "asciitab.h"
973
#undef BT_COLON
974
#include "latin1tab.h"
975
},
976
STANDARD_VTABLE(big2_) NULL_VTABLE};
977
978
#if BYTEORDER != 1234
979
980
# ifdef XML_NS
981
982
/* "Internal" big-endian 16-bit descriptor for XML_NS builds; compiled only
   when BYTEORDER != 1234.  Differs from big2_encoding_ns in taking the low
   half of type[] from iasciitab.h and in hard-wiring the last ENCODING
   integer to 1. */
static const struct normal_encoding internal_big2_encoding_ns
983
= {{VTABLE, 2, 0, 1},
984
{
985
# include "iasciitab.h"
986
# include "latin1tab.h"
987
},
988
STANDARD_VTABLE(big2_) NULL_VTABLE};
989
990
# endif
991
992
/* "Internal" big-endian 16-bit descriptor; compiled only when
   BYTEORDER != 1234.  type[] is built from iasciitab.h (with BT_COLON
   defined as BT_NMSTRT during the include) and latin1tab.h; the last
   ENCODING integer is hard-wired to 1. */
static const struct normal_encoding internal_big2_encoding
993
= {{VTABLE, 2, 0, 1},
994
{
995
# define BT_COLON BT_NMSTRT
996
# include "iasciitab.h"
997
# undef BT_COLON
998
# include "latin1tab.h"
999
},
1000
STANDARD_VTABLE(big2_) NULL_VTABLE};
1001
1002
#endif
1003
1004
#undef PREFIX
1005
1006
static int FASTCALL
1007
streqci(const char *s1, const char *s2) {
1008
for (;;) {
1009
char c1 = *s1++;
1010
char c2 = *s2++;
1011
if (ASCII_a <= c1 && c1 <= ASCII_z)
1012
c1 += ASCII_A - ASCII_a;
1013
if (ASCII_a <= c2 && c2 <= ASCII_z)
1014
/* The following line will never get executed. streqci() is
1015
* only called from two places, both of which guarantee to put
1016
* upper-case strings into s2.
1017
*/
1018
c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019
if (c1 != c2)
1020
return 0;
1021
if (! c1)
1022
break;
1023
}
1024
return 1;
1025
}
1026
1027
static void PTRCALL
1028
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029
POSITION *pos) {
1030
UNUSED_P(enc);
1031
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032
}
1033
1034
static int
1035
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036
char buf[1];
1037
char *p = buf;
1038
XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039
if (p == buf)
1040
return -1;
1041
else
1042
return buf[0];
1043
}
1044
1045
static int FASTCALL
1046
isSpace(int c) {
1047
switch (c) {
1048
case 0x20:
1049
case 0xD:
1050
case 0xA:
1051
case 0x9:
1052
return 1;
1053
}
1054
return 0;
1055
}
1056
1057
/* Return 1 if there's just optional white space or there's an S
1058
followed by name=val.
1059
*/
1060
static int
1061
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062
const char **namePtr, const char **nameEndPtr,
1063
const char **valPtr, const char **nextTokPtr) {
1064
int c;
1065
char open;
1066
if (ptr == end) {
1067
*namePtr = NULL;
1068
return 1;
1069
}
1070
if (! isSpace(toAscii(enc, ptr, end))) {
1071
*nextTokPtr = ptr;
1072
return 0;
1073
}
1074
do {
1075
ptr += enc->minBytesPerChar;
1076
} while (isSpace(toAscii(enc, ptr, end)));
1077
if (ptr == end) {
1078
*namePtr = NULL;
1079
return 1;
1080
}
1081
*namePtr = ptr;
1082
for (;;) {
1083
c = toAscii(enc, ptr, end);
1084
if (c == -1) {
1085
*nextTokPtr = ptr;
1086
return 0;
1087
}
1088
if (c == ASCII_EQUALS) {
1089
*nameEndPtr = ptr;
1090
break;
1091
}
1092
if (isSpace(c)) {
1093
*nameEndPtr = ptr;
1094
do {
1095
ptr += enc->minBytesPerChar;
1096
} while (isSpace(c = toAscii(enc, ptr, end)));
1097
if (c != ASCII_EQUALS) {
1098
*nextTokPtr = ptr;
1099
return 0;
1100
}
1101
break;
1102
}
1103
ptr += enc->minBytesPerChar;
1104
}
1105
if (ptr == *namePtr) {
1106
*nextTokPtr = ptr;
1107
return 0;
1108
}
1109
ptr += enc->minBytesPerChar;
1110
c = toAscii(enc, ptr, end);
1111
while (isSpace(c)) {
1112
ptr += enc->minBytesPerChar;
1113
c = toAscii(enc, ptr, end);
1114
}
1115
if (c != ASCII_QUOT && c != ASCII_APOS) {
1116
*nextTokPtr = ptr;
1117
return 0;
1118
}
1119
open = (char)c;
1120
ptr += enc->minBytesPerChar;
1121
*valPtr = ptr;
1122
for (;; ptr += enc->minBytesPerChar) {
1123
c = toAscii(enc, ptr, end);
1124
if (c == open)
1125
break;
1126
if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127
&& ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128
&& c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129
*nextTokPtr = ptr;
1130
return 0;
1131
}
1132
}
1133
*nextTokPtr = ptr + enc->minBytesPerChar;
1134
return 1;
1135
}
1136
1137
static const char KW_version[]
1138
= {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1139
1140
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141
ASCII_i, ASCII_n, ASCII_g, '\0'};
1142
1143
static const char KW_standalone[]
1144
= {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145
ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1146
1147
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1148
1149
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
1151
static int
1152
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153
const char *),
1154
int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155
const char *end, const char **badPtr, const char **versionPtr,
1156
const char **versionEndPtr, const char **encodingName,
1157
const ENCODING **encoding, int *standalone) {
1158
const char *val = NULL;
1159
const char *name = NULL;
1160
const char *nameEnd = NULL;
1161
ptr += 5 * enc->minBytesPerChar;
1162
end -= 2 * enc->minBytesPerChar;
1163
if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164
|| ! name) {
1165
*badPtr = ptr;
1166
return 0;
1167
}
1168
if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169
if (! isGeneralTextEntity) {
1170
*badPtr = name;
1171
return 0;
1172
}
1173
} else {
1174
if (versionPtr)
1175
*versionPtr = val;
1176
if (versionEndPtr)
1177
*versionEndPtr = ptr;
1178
if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179
*badPtr = ptr;
1180
return 0;
1181
}
1182
if (! name) {
1183
if (isGeneralTextEntity) {
1184
/* a TextDecl must have an EncodingDecl */
1185
*badPtr = ptr;
1186
return 0;
1187
}
1188
return 1;
1189
}
1190
}
1191
if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192
int c = toAscii(enc, val, end);
1193
if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194
*badPtr = val;
1195
return 0;
1196
}
1197
if (encodingName)
1198
*encodingName = val;
1199
if (encoding)
1200
*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201
if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202
*badPtr = ptr;
1203
return 0;
1204
}
1205
if (! name)
1206
return 1;
1207
}
1208
if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209
|| isGeneralTextEntity) {
1210
*badPtr = name;
1211
return 0;
1212
}
1213
if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214
if (standalone)
1215
*standalone = 1;
1216
} else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217
if (standalone)
1218
*standalone = 0;
1219
} else {
1220
*badPtr = val;
1221
return 0;
1222
}
1223
while (isSpace(toAscii(enc, ptr, end)))
1224
ptr += enc->minBytesPerChar;
1225
if (ptr != end) {
1226
*badPtr = ptr;
1227
return 0;
1228
}
1229
return 1;
1230
}
1231
1232
static int FASTCALL
1233
checkCharRefNumber(int result) {
1234
switch (result >> 8) {
1235
case 0xD8:
1236
case 0xD9:
1237
case 0xDA:
1238
case 0xDB:
1239
case 0xDC:
1240
case 0xDD:
1241
case 0xDE:
1242
case 0xDF:
1243
return -1;
1244
case 0:
1245
if (latin1_encoding.type[result] == BT_NONXML)
1246
return -1;
1247
break;
1248
case 0xFF:
1249
if (result == 0xFFFE || result == 0xFFFF)
1250
return -1;
1251
break;
1252
}
1253
return result;
1254
}
1255
1256
int FASTCALL
1257
XmlUtf8Encode(int c, char *buf) {
1258
enum {
1259
/* minN is minimum legal resulting value for N byte sequence */
1260
min2 = 0x80,
1261
min3 = 0x800,
1262
min4 = 0x10000
1263
};
1264
1265
if (c < 0)
1266
return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267
if (c < min2) {
1268
buf[0] = (char)(c | UTF8_cval1);
1269
return 1;
1270
}
1271
if (c < min3) {
1272
buf[0] = (char)((c >> 6) | UTF8_cval2);
1273
buf[1] = (char)((c & 0x3f) | 0x80);
1274
return 2;
1275
}
1276
if (c < min4) {
1277
buf[0] = (char)((c >> 12) | UTF8_cval3);
1278
buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279
buf[2] = (char)((c & 0x3f) | 0x80);
1280
return 3;
1281
}
1282
if (c < 0x110000) {
1283
buf[0] = (char)((c >> 18) | UTF8_cval4);
1284
buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285
buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286
buf[3] = (char)((c & 0x3f) | 0x80);
1287
return 4;
1288
}
1289
return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290
}
1291
1292
int FASTCALL
1293
XmlUtf16Encode(int charNum, unsigned short *buf) {
1294
if (charNum < 0)
1295
return 0;
1296
if (charNum < 0x10000) {
1297
buf[0] = (unsigned short)charNum;
1298
return 1;
1299
}
1300
if (charNum < 0x110000) {
1301
charNum -= 0x10000;
1302
buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303
buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304
return 2;
1305
}
1306
return 0;
1307
}
1308
1309
struct unknown_encoding {
1310
struct normal_encoding normal;
1311
CONVERTER convert;
1312
void *userData;
1313
unsigned short utf16[256];
1314
char utf8[256][4];
1315
};
1316
1317
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319
int
1320
XmlSizeOfUnknownEncoding(void) {
1321
return sizeof(struct unknown_encoding);
1322
}
1323
1324
static int PTRFASTCALL
1325
unknown_isName(const ENCODING *enc, const char *p) {
1326
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327
int c = uenc->convert(uenc->userData, p);
1328
if (c & ~0xFFFF)
1329
return 0;
1330
return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331
}
1332
1333
static int PTRFASTCALL
1334
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336
int c = uenc->convert(uenc->userData, p);
1337
if (c & ~0xFFFF)
1338
return 0;
1339
return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340
}
1341
1342
static int PTRFASTCALL
1343
unknown_isInvalid(const ENCODING *enc, const char *p) {
1344
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345
int c = uenc->convert(uenc->userData, p);
1346
return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347
}
1348
1349
static enum XML_Convert_Result PTRCALL
1350
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351
char **toP, const char *toLim) {
1352
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353
char buf[XML_UTF8_ENCODE_MAX];
1354
for (;;) {
1355
const char *utf8;
1356
int n;
1357
if (*fromP == fromLim)
1358
return XML_CONVERT_COMPLETED;
1359
utf8 = uenc->utf8[(unsigned char)**fromP];
1360
n = *utf8++;
1361
if (n == 0) {
1362
int c = uenc->convert(uenc->userData, *fromP);
1363
n = XmlUtf8Encode(c, buf);
1364
if (n > toLim - *toP)
1365
return XML_CONVERT_OUTPUT_EXHAUSTED;
1366
utf8 = buf;
1367
*fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368
- (BT_LEAD2 - 2));
1369
} else {
1370
if (n > toLim - *toP)
1371
return XML_CONVERT_OUTPUT_EXHAUSTED;
1372
(*fromP)++;
1373
}
1374
memcpy(*toP, utf8, n);
1375
*toP += n;
1376
}
1377
}
1378
1379
static enum XML_Convert_Result PTRCALL
1380
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381
unsigned short **toP, const unsigned short *toLim) {
1382
const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383
while (*fromP < fromLim && *toP < toLim) {
1384
unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385
if (c == 0) {
1386
c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387
*fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388
- (BT_LEAD2 - 2));
1389
} else
1390
(*fromP)++;
1391
*(*toP)++ = c;
1392
}
1393
1394
if ((*toP == toLim) && (*fromP < fromLim))
1395
return XML_CONVERT_OUTPUT_EXHAUSTED;
1396
else
1397
return XML_CONVERT_COMPLETED;
1398
}
1399
1400
ENCODING *
1401
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402
void *userData) {
1403
int i;
1404
struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405
memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406
for (i = 0; i < 128; i++)
1407
if (latin1_encoding.type[i] != BT_OTHER
1408
&& latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409
return 0;
1410
for (i = 0; i < 256; i++) {
1411
int c = table[i];
1412
if (c == -1) {
1413
e->normal.type[i] = BT_MALFORM;
1414
/* This shouldn't really get used. */
1415
e->utf16[i] = 0xFFFF;
1416
e->utf8[i][0] = 1;
1417
e->utf8[i][1] = 0;
1418
} else if (c < 0) {
1419
if (c < -4)
1420
return 0;
1421
/* Multi-byte sequences need a converter function */
1422
if (! convert)
1423
return 0;
1424
e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425
e->utf8[i][0] = 0;
1426
e->utf16[i] = 0;
1427
} else if (c < 0x80) {
1428
if (latin1_encoding.type[c] != BT_OTHER
1429
&& latin1_encoding.type[c] != BT_NONXML && c != i)
1430
return 0;
1431
e->normal.type[i] = latin1_encoding.type[c];
1432
e->utf8[i][0] = 1;
1433
e->utf8[i][1] = (char)c;
1434
e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435
} else if (checkCharRefNumber(c) < 0) {
1436
e->normal.type[i] = BT_NONXML;
1437
/* This shouldn't really get used. */
1438
e->utf16[i] = 0xFFFF;
1439
e->utf8[i][0] = 1;
1440
e->utf8[i][1] = 0;
1441
} else {
1442
if (c > 0xFFFF)
1443
return 0;
1444
if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445
e->normal.type[i] = BT_NMSTRT;
1446
else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447
e->normal.type[i] = BT_NAME;
1448
else
1449
e->normal.type[i] = BT_OTHER;
1450
e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451
e->utf16[i] = (unsigned short)c;
1452
}
1453
}
1454
e->userData = userData;
1455
e->convert = convert;
1456
if (convert) {
1457
e->normal.isName2 = unknown_isName;
1458
e->normal.isName3 = unknown_isName;
1459
e->normal.isName4 = unknown_isName;
1460
e->normal.isNmstrt2 = unknown_isNmstrt;
1461
e->normal.isNmstrt3 = unknown_isNmstrt;
1462
e->normal.isNmstrt4 = unknown_isNmstrt;
1463
e->normal.isInvalid2 = unknown_isInvalid;
1464
e->normal.isInvalid3 = unknown_isInvalid;
1465
e->normal.isInvalid4 = unknown_isInvalid;
1466
}
1467
e->normal.enc.utf8Convert = unknown_toUtf8;
1468
e->normal.enc.utf16Convert = unknown_toUtf16;
1469
return &(e->normal.enc);
1470
}
1471
1472
/* Encoding indices.  If this enumeration is changed, getEncodingIndex
   and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1, /* returned by getEncodingIndex for unlisted names */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC /* returned by getEncodingIndex for a NULL name */
};
1485
1486
static const char KW_ISO_8859_1[]
1487
= {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1488
ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1489
static const char KW_US_ASCII[]
1490
= {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491
ASCII_C, ASCII_I, ASCII_I, '\0'};
1492
static const char KW_UTF_8[]
1493
= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1494
static const char KW_UTF_16[]
1495
= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1496
static const char KW_UTF_16BE[]
1497
= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498
ASCII_6, ASCII_B, ASCII_E, '\0'};
1499
static const char KW_UTF_16LE[]
1500
= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501
ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503
static int FASTCALL
1504
getEncodingIndex(const char *name) {
1505
static const char *const encodingNames[] = {
1506
KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507
};
1508
int i;
1509
if (name == NULL)
1510
return NO_ENC;
1511
for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512
if (streqci(name, encodingNames[i]))
1513
return i;
1514
return UNKNOWN_ENC;
1515
}
1516
1517
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The
   argument is parenthesized so that the cast applies to the whole
   expression passed as i, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
1524
/* This is what detects the encoding. encodingTable maps from
1525
encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1526
the external (protocol) specified encoding; state is
1527
XML_CONTENT_STATE if we're parsing an external text entity, and
1528
XML_PROLOG_STATE otherwise.
1529
*/
1530
1531
static int
1532
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1533
int state, const char *ptr, const char *end, const char **nextTokPtr) {
1534
const ENCODING **encPtr;
1535
1536
if (ptr >= end)
1537
return XML_TOK_NONE;
1538
encPtr = enc->encPtr;
1539
if (ptr + 1 == end) {
1540
/* only a single byte available for auto-detection */
1541
#ifndef XML_DTD /* FIXME */
1542
/* a well-formed document entity must have more than one byte */
1543
if (state != XML_CONTENT_STATE)
1544
return XML_TOK_PARTIAL;
1545
#endif
1546
/* so we're parsing an external text entity... */
1547
/* if UTF-16 was externally specified, then we need at least 2 bytes */
1548
switch (INIT_ENC_INDEX(enc)) {
1549
case UTF_16_ENC:
1550
case UTF_16LE_ENC:
1551
case UTF_16BE_ENC:
1552
return XML_TOK_PARTIAL;
1553
}
1554
switch ((unsigned char)*ptr) {
1555
case 0xFE:
1556
case 0xFF:
1557
case 0xEF: /* possibly first byte of UTF-8 BOM */
1558
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1559
break;
1560
/* fall through */
1561
case 0x00:
1562
case 0x3C:
1563
return XML_TOK_PARTIAL;
1564
}
1565
} else {
1566
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1567
case 0xFEFF:
1568
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1569
break;
1570
*nextTokPtr = ptr + 2;
1571
*encPtr = encodingTable[UTF_16BE_ENC];
1572
return XML_TOK_BOM;
1573
/* 00 3C is handled in the default case */
1574
case 0x3C00:
1575
if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1576
|| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1577
&& state == XML_CONTENT_STATE)
1578
break;
1579
*encPtr = encodingTable[UTF_16LE_ENC];
1580
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1581
case 0xFFFE:
1582
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1583
break;
1584
*nextTokPtr = ptr + 2;
1585
*encPtr = encodingTable[UTF_16LE_ENC];
1586
return XML_TOK_BOM;
1587
case 0xEFBB:
1588
/* Maybe a UTF-8 BOM (EF BB BF) */
1589
/* If there's an explicitly specified (external) encoding
1590
of ISO-8859-1 or some flavour of UTF-16
1591
and this is an external text entity,
1592
don't look for the BOM,
1593
because it might be a legal data.
1594
*/
1595
if (state == XML_CONTENT_STATE) {
1596
int e = INIT_ENC_INDEX(enc);
1597
if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1598
|| e == UTF_16_ENC)
1599
break;
1600
}
1601
if (ptr + 2 == end)
1602
return XML_TOK_PARTIAL;
1603
if ((unsigned char)ptr[2] == 0xBF) {
1604
*nextTokPtr = ptr + 3;
1605
*encPtr = encodingTable[UTF_8_ENC];
1606
return XML_TOK_BOM;
1607
}
1608
break;
1609
default:
1610
if (ptr[0] == '\0') {
1611
/* 0 isn't a legal data character. Furthermore a document
1612
entity can only start with ASCII characters. So the only
1613
way this can fail to be big-endian UTF-16 if it it's an
1614
external parsed general entity that's labelled as
1615
UTF-16LE.
1616
*/
1617
if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1618
break;
1619
*encPtr = encodingTable[UTF_16BE_ENC];
1620
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621
} else if (ptr[1] == '\0') {
1622
/* We could recover here in the case:
1623
- parsing an external entity
1624
- second byte is 0
1625
- no externally specified encoding
1626
- no encoding declaration
1627
by assuming UTF-16LE. But we don't, because this would mean when
1628
presented just with a single byte, we couldn't reliably determine
1629
whether we needed further bytes.
1630
*/
1631
if (state == XML_CONTENT_STATE)
1632
break;
1633
*encPtr = encodingTable[UTF_16LE_ENC];
1634
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1635
}
1636
break;
1637
}
1638
}
1639
*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1640
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1641
}
1642
1643
#define NS(x) x
1644
#define ns(x) x
1645
#define XML_TOK_NS_C
1646
#include "xmltok_ns.c"
1647
#undef XML_TOK_NS_C
1648
#undef NS
1649
#undef ns
1650
1651
#ifdef XML_NS
1652
1653
# define NS(x) x##NS
1654
# define ns(x) x##_ns
1655
1656
# define XML_TOK_NS_C
1657
# include "xmltok_ns.c"
1658
# undef XML_TOK_NS_C
1659
1660
# undef NS
1661
# undef ns
1662
1663
ENCODING *
1664
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665
void *userData) {
1666
ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667
if (enc)
1668
((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669
return enc;
1670
}
1671
1672
#endif /* XML_NS */
1673
1674