Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/xmlstring.c
4393 views
1
/*
2
* string.c : an XML string utilities module
3
*
4
* This module provides various utility functions for manipulating
5
* the xmlChar* type. All functions named xmlStr* have been moved here
6
* from the parser.c file (their original home).
7
*
8
* See Copyright for the status of this software.
9
*
10
* UTF8 string routines from:
11
* William Brack <[email protected]>
12
*
13
* [email protected]
14
*/
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25
26
#include "private/parser.h"
27
#include "private/string.h"
28
29
/************************************************************************
30
* *
31
* Commodity functions to handle xmlChars *
32
* *
33
************************************************************************/
34
35
/**
36
* xmlStrndup:
37
* @cur: the input xmlChar *
38
* @len: the len of @cur
39
*
40
* a strndup for array of xmlChar's
41
*
42
* Returns a new xmlChar * or NULL
43
*/
44
xmlChar *
45
xmlStrndup(const xmlChar *cur, int len) {
46
xmlChar *ret;
47
48
if ((cur == NULL) || (len < 0)) return(NULL);
49
ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
if (ret == NULL) {
51
return(NULL);
52
}
53
memcpy(ret, cur, len);
54
ret[len] = 0;
55
return(ret);
56
}
57
58
/**
59
* xmlStrdup:
60
* @cur: the input xmlChar *
61
*
62
* a strdup for array of xmlChar's. Since they are supposed to be
63
* encoded in UTF-8 or an encoding with 8bit based chars, we assume
64
* a termination mark of '0'.
65
*
66
* Returns a new xmlChar * or NULL
67
*/
68
xmlChar *
69
xmlStrdup(const xmlChar *cur) {
70
const xmlChar *p = cur;
71
72
if (cur == NULL) return(NULL);
73
while (*p != 0) p++; /* non input consuming */
74
return(xmlStrndup(cur, p - cur));
75
}
76
77
/**
78
* xmlCharStrndup:
79
* @cur: the input char *
80
* @len: the len of @cur
81
*
82
* a strndup for char's to xmlChar's
83
*
84
* Returns a new xmlChar * or NULL
85
*/
86
87
xmlChar *
88
xmlCharStrndup(const char *cur, int len) {
89
int i;
90
xmlChar *ret;
91
92
if ((cur == NULL) || (len < 0)) return(NULL);
93
ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94
if (ret == NULL) {
95
return(NULL);
96
}
97
for (i = 0;i < len;i++) {
98
/* Explicit sign change */
99
ret[i] = (xmlChar) cur[i];
100
if (ret[i] == 0) return(ret);
101
}
102
ret[len] = 0;
103
return(ret);
104
}
105
106
/**
107
* xmlCharStrdup:
108
* @cur: the input char *
109
*
110
* a strdup for char's to xmlChar's
111
*
112
* Returns a new xmlChar * or NULL
113
*/
114
115
xmlChar *
116
xmlCharStrdup(const char *cur) {
117
const char *p = cur;
118
119
if (cur == NULL) return(NULL);
120
while (*p != '\0') p++; /* non input consuming */
121
return(xmlCharStrndup(cur, p - cur));
122
}
123
124
/**
125
* xmlStrcmp:
126
* @str1: the first xmlChar *
127
* @str2: the second xmlChar *
128
*
129
* a strcmp for xmlChar's
130
*
131
* Returns the integer result of the comparison
132
*/
133
134
int
135
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
136
if (str1 == str2) return(0);
137
if (str1 == NULL) return(-1);
138
if (str2 == NULL) return(1);
139
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140
return(strcmp((const char *)str1, (const char *)str2));
141
#else
142
do {
143
int tmp = *str1++ - *str2;
144
if (tmp != 0) return(tmp);
145
} while (*str2++ != 0);
146
return 0;
147
#endif
148
}
149
150
/**
151
* xmlStrEqual:
152
* @str1: the first xmlChar *
153
* @str2: the second xmlChar *
154
*
155
* Check if both strings are equal of have same content.
156
* Should be a bit more readable and faster than xmlStrcmp()
157
*
158
* Returns 1 if they are equal, 0 if they are different
159
*/
160
161
int
162
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
163
if (str1 == str2) return(1);
164
if (str1 == NULL) return(0);
165
if (str2 == NULL) return(0);
166
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167
return(strcmp((const char *)str1, (const char *)str2) == 0);
168
#else
169
do {
170
if (*str1++ != *str2) return(0);
171
} while (*str2++);
172
return(1);
173
#endif
174
}
175
176
/**
177
* xmlStrQEqual:
178
* @pref: the prefix of the QName
179
* @name: the localname of the QName
180
* @str: the second xmlChar *
181
*
182
* Check if a QName is Equal to a given string
183
*
184
* Returns 1 if they are equal, 0 if they are different
185
*/
186
187
int
188
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
189
if (pref == NULL) return(xmlStrEqual(name, str));
190
if (name == NULL) return(0);
191
if (str == NULL) return(0);
192
193
do {
194
if (*pref++ != *str) return(0);
195
} while ((*str++) && (*pref));
196
if (*str++ != ':') return(0);
197
do {
198
if (*name++ != *str) return(0);
199
} while (*str++);
200
return(1);
201
}
202
203
/**
204
* xmlStrncmp:
205
* @str1: the first xmlChar *
206
* @str2: the second xmlChar *
207
* @len: the max comparison length
208
*
209
* a strncmp for xmlChar's
210
*
211
* Returns the integer result of the comparison
212
*/
213
214
int
215
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
216
if (len <= 0) return(0);
217
if (str1 == str2) return(0);
218
if (str1 == NULL) return(-1);
219
if (str2 == NULL) return(1);
220
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221
return(strncmp((const char *)str1, (const char *)str2, len));
222
#else
223
do {
224
int tmp = *str1++ - *str2;
225
if (tmp != 0 || --len == 0) return(tmp);
226
} while (*str2++ != 0);
227
return 0;
228
#endif
229
}
230
231
/*
 * Case-folding table used by the case-insensitive comparison routines:
 * ASCII upper-case letters 'A'..'Z' (0x41..0x5A) map to their lower-case
 * counterparts (0x61..0x7A); every other byte value maps to itself.
 * In particular '[' (0x5B) must stay 0x5B so that it does not compare
 * equal to '{' (0x7B).
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
265
266
/**
267
* xmlStrcasecmp:
268
* @str1: the first xmlChar *
269
* @str2: the second xmlChar *
270
*
271
* a strcasecmp for xmlChar's
272
*
273
* Returns the integer result of the comparison
274
*/
275
276
int
277
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
278
register int tmp;
279
280
if (str1 == str2) return(0);
281
if (str1 == NULL) return(-1);
282
if (str2 == NULL) return(1);
283
do {
284
tmp = casemap[*str1++] - casemap[*str2];
285
if (tmp != 0) return(tmp);
286
} while (*str2++ != 0);
287
return 0;
288
}
289
290
/**
291
* xmlStrncasecmp:
292
* @str1: the first xmlChar *
293
* @str2: the second xmlChar *
294
* @len: the max comparison length
295
*
296
* a strncasecmp for xmlChar's
297
*
298
* Returns the integer result of the comparison
299
*/
300
301
int
302
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
303
register int tmp;
304
305
if (len <= 0) return(0);
306
if (str1 == str2) return(0);
307
if (str1 == NULL) return(-1);
308
if (str2 == NULL) return(1);
309
do {
310
tmp = casemap[*str1++] - casemap[*str2];
311
if (tmp != 0 || --len == 0) return(tmp);
312
} while (*str2++ != 0);
313
return 0;
314
}
315
316
/**
317
* xmlStrchr:
318
* @str: the xmlChar * array
319
* @val: the xmlChar to search
320
*
321
* a strchr for xmlChar's
322
*
323
* Returns the xmlChar * for the first occurrence or NULL.
324
*/
325
326
const xmlChar *
327
xmlStrchr(const xmlChar *str, xmlChar val) {
328
if (str == NULL) return(NULL);
329
while (*str != 0) { /* non input consuming */
330
if (*str == val) return((xmlChar *) str);
331
str++;
332
}
333
return(NULL);
334
}
335
336
/**
337
* xmlStrstr:
338
* @str: the xmlChar * array (haystack)
339
* @val: the xmlChar to search (needle)
340
*
341
* a strstr for xmlChar's
342
*
343
* Returns the xmlChar * for the first occurrence or NULL.
344
*/
345
346
const xmlChar *
347
xmlStrstr(const xmlChar *str, const xmlChar *val) {
348
int n;
349
350
if (str == NULL) return(NULL);
351
if (val == NULL) return(NULL);
352
n = xmlStrlen(val);
353
354
if (n == 0) return(str);
355
while (*str != 0) { /* non input consuming */
356
if (*str == *val) {
357
if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358
}
359
str++;
360
}
361
return(NULL);
362
}
363
364
/**
365
* xmlStrcasestr:
366
* @str: the xmlChar * array (haystack)
367
* @val: the xmlChar to search (needle)
368
*
369
* a case-ignoring strstr for xmlChar's
370
*
371
* Returns the xmlChar * for the first occurrence or NULL.
372
*/
373
374
const xmlChar *
375
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
376
int n;
377
378
if (str == NULL) return(NULL);
379
if (val == NULL) return(NULL);
380
n = xmlStrlen(val);
381
382
if (n == 0) return(str);
383
while (*str != 0) { /* non input consuming */
384
if (casemap[*str] == casemap[*val])
385
if (!xmlStrncasecmp(str, val, n)) return(str);
386
str++;
387
}
388
return(NULL);
389
}
390
391
/**
392
* xmlStrsub:
393
* @str: the xmlChar * array (haystack)
394
* @start: the index of the first char (zero based)
395
* @len: the length of the substring
396
*
397
* Extract a substring of a given string
398
*
399
* Returns the xmlChar * for the first occurrence or NULL.
400
*/
401
402
xmlChar *
403
xmlStrsub(const xmlChar *str, int start, int len) {
404
int i;
405
406
if (str == NULL) return(NULL);
407
if (start < 0) return(NULL);
408
if (len < 0) return(NULL);
409
410
for (i = 0;i < start;i++) {
411
if (*str == 0) return(NULL);
412
str++;
413
}
414
if (*str == 0) return(NULL);
415
return(xmlStrndup(str, len));
416
}
417
418
/**
419
* xmlStrlen:
420
* @str: the xmlChar * array
421
*
422
* length of a xmlChar's string
423
*
424
* Returns the number of xmlChar contained in the ARRAY.
425
*/
426
427
int
428
xmlStrlen(const xmlChar *str) {
429
size_t len = str ? strlen((const char *)str) : 0;
430
return(len > INT_MAX ? 0 : len);
431
}
432
433
/**
434
* xmlStrncat:
435
* @cur: the original xmlChar * array
436
* @add: the xmlChar * array added
437
* @len: the length of @add
438
*
439
* a strncat for array of xmlChar's, it will extend @cur with the len
440
* first bytes of @add. Note that if @len < 0 then this is an API error
441
* and NULL will be returned.
442
*
443
* Returns a new xmlChar *, the original @cur is reallocated and should
444
* not be freed.
445
*/
446
447
xmlChar *
448
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449
int size;
450
xmlChar *ret;
451
452
if ((add == NULL) || (len == 0))
453
return(cur);
454
if (len < 0)
455
return(NULL);
456
if (cur == NULL)
457
return(xmlStrndup(add, len));
458
459
size = xmlStrlen(cur);
460
if ((size < 0) || (size > INT_MAX - len))
461
return(NULL);
462
ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463
if (ret == NULL) {
464
return(cur);
465
}
466
memcpy(&ret[size], add, len);
467
ret[size + len] = 0;
468
return(ret);
469
}
470
471
/**
472
* xmlStrncatNew:
473
* @str1: first xmlChar string
474
* @str2: second xmlChar string
475
* @len: the len of @str2 or < 0
476
*
477
* same as xmlStrncat, but creates a new string. The original
478
* two strings are not freed. If @len is < 0 then the length
479
* will be calculated automatically.
480
*
481
* Returns a new xmlChar * or NULL
482
*/
483
xmlChar *
484
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
485
int size;
486
xmlChar *ret;
487
488
if (len < 0) {
489
len = xmlStrlen(str2);
490
if (len < 0)
491
return(NULL);
492
}
493
if ((str2 == NULL) || (len == 0))
494
return(xmlStrdup(str1));
495
if (str1 == NULL)
496
return(xmlStrndup(str2, len));
497
498
size = xmlStrlen(str1);
499
if ((size < 0) || (size > INT_MAX - len))
500
return(NULL);
501
ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502
if (ret == NULL) {
503
return(xmlStrndup(str1, size));
504
}
505
memcpy(ret, str1, size);
506
memcpy(&ret[size], str2, len);
507
ret[size + len] = 0;
508
return(ret);
509
}
510
511
/**
512
* xmlStrcat:
513
* @cur: the original xmlChar * array
514
* @add: the xmlChar * array added
515
*
516
* a strcat for array of xmlChar's. Since they are supposed to be
517
* encoded in UTF-8 or an encoding with 8bit based chars, we assume
518
* a termination mark of '0'.
519
*
520
* Returns a new xmlChar * containing the concatenated string. The original
521
* @cur is reallocated and should not be freed.
522
*/
523
xmlChar *
524
xmlStrcat(xmlChar *cur, const xmlChar *add) {
525
const xmlChar *p = add;
526
527
if (add == NULL) return(cur);
528
if (cur == NULL)
529
return(xmlStrdup(add));
530
531
while (*p != 0) p++; /* non input consuming */
532
return(xmlStrncat(cur, add, p - add));
533
}
534
535
/**
536
* xmlStrPrintf:
537
* @buf: the result buffer.
538
* @len: the result buffer length.
539
* @msg: the message with printf formatting.
540
* @...: extra parameters for the message.
541
*
542
* Formats @msg and places result into @buf.
543
*
544
* Returns the number of characters written to @buf or -1 if an error occurs.
545
*/
546
int
547
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
548
va_list args;
549
int ret;
550
551
if((buf == NULL) || (msg == NULL)) {
552
return(-1);
553
}
554
555
va_start(args, msg);
556
ret = vsnprintf((char *) buf, len, (const char *) msg, args);
557
va_end(args);
558
buf[len - 1] = 0; /* be safe ! */
559
560
return(ret);
561
}
562
563
/**
564
* xmlStrVPrintf:
565
* @buf: the result buffer.
566
* @len: the result buffer length.
567
* @msg: the message with printf formatting.
568
* @ap: extra parameters for the message.
569
*
570
* Formats @msg and places result into @buf.
571
*
572
* Returns the number of characters written to @buf or -1 if an error occurs.
573
*/
574
int
575
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
576
int ret;
577
578
if((buf == NULL) || (msg == NULL)) {
579
return(-1);
580
}
581
582
ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
583
buf[len - 1] = 0; /* be safe ! */
584
585
return(ret);
586
}
587
588
/************************************************************************
589
* *
590
* Generic UTF8 handling routines *
591
* *
592
* From rfc2044: encoding of the Unicode values on UTF-8: *
593
* *
594
* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
595
* 0000 0000-0000 007F 0xxxxxxx *
596
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
597
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
598
* *
599
* I hope we won't use values > 0xFFFF anytime soon ! *
600
* *
601
************************************************************************/
602
603
604
/**
605
* xmlUTF8Size:
606
* @utf: pointer to the UTF8 character
607
*
608
* calculates the internal size of a UTF8 character
609
*
610
* returns the numbers of bytes in the character, -1 on format error
611
*/
612
int
613
xmlUTF8Size(const xmlChar *utf) {
614
xmlChar mask;
615
int len;
616
617
if (utf == NULL)
618
return -1;
619
if (*utf < 0x80)
620
return 1;
621
/* check valid UTF8 character */
622
if (!(*utf & 0x40))
623
return -1;
624
/* determine number of bytes in char */
625
len = 2;
626
for (mask=0x20; mask != 0; mask>>=1) {
627
if (!(*utf & mask))
628
return len;
629
len++;
630
}
631
return -1;
632
}
633
634
/**
635
* xmlUTF8Charcmp:
636
* @utf1: pointer to first UTF8 char
637
* @utf2: pointer to second UTF8 char
638
*
639
* compares the two UCS4 values
640
*
641
* returns result of the compare as with xmlStrncmp
642
*/
643
int
644
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645
646
if (utf1 == NULL ) {
647
if (utf2 == NULL)
648
return 0;
649
return -1;
650
}
651
return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652
}
653
654
/**
655
* xmlUTF8Strlen:
656
* @utf: a sequence of UTF-8 encoded bytes
657
*
658
* compute the length of an UTF8 string, it doesn't do a full UTF8
659
* checking of the content of the string.
660
*
661
* Returns the number of characters in the string or -1 in case of error
662
*/
663
int
664
xmlUTF8Strlen(const xmlChar *utf) {
665
size_t ret = 0;
666
667
if (utf == NULL)
668
return(-1);
669
670
while (*utf != 0) {
671
if (utf[0] & 0x80) {
672
if ((utf[1] & 0xc0) != 0x80)
673
return(-1);
674
if ((utf[0] & 0xe0) == 0xe0) {
675
if ((utf[2] & 0xc0) != 0x80)
676
return(-1);
677
if ((utf[0] & 0xf0) == 0xf0) {
678
if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
679
return(-1);
680
utf += 4;
681
} else {
682
utf += 3;
683
}
684
} else {
685
utf += 2;
686
}
687
} else {
688
utf++;
689
}
690
ret++;
691
}
692
return(ret > INT_MAX ? 0 : ret);
693
}
694
695
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Overlong encodings,
 * surrogate code points (0xD800-0xDFFF), values above 0x10FFFF and lead
 * bytes that can never appear in UTF-8 (0x80-0xC1, 0xF8-0xFF) are
 * rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *         the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        if (*len < 1)
            goto error;
        /* 1-byte code */
        *len = 1;
    } else {
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /* 0x80-0xBF is a stray continuation, 0xC0/0xC1 is overlong */
            if (c < 0xc2)
                goto error;
            /* 2-byte code */
            *len = 2;
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                *len = 3;
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* reject overlong forms and UTF-16 surrogates */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
            } else {
                /*
                 * Only 11110xxx starts a 4-byte code. Without this check
                 * the 0x7 mask below would silently fold 0xF8-0xFF onto
                 * valid lead bytes.
                 */
                if (c >= 0xf8)
                    goto error;
                if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* reject overlong forms and values beyond Unicode */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
764
765
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    if (utf == NULL)
        return(0);

    /*
     * Each character is a sequence of 1, 2, 3 or 4 bytes. The accepted
     * shapes are (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while (*utf != 0) {         /* string is 0-terminated */
        unsigned char lead = *utf;
        int width;
        int k;

        if ((lead & 0x80) == 0x00)        /* 1-byte code, starts with 0 */
            width = 1;
        else if ((lead & 0xe0) == 0xc0)   /* 2-byte code, starts with 110 */
            width = 2;
        else if ((lead & 0xf0) == 0xe0)   /* 3-byte code, starts with 1110 */
            width = 3;
        else if ((lead & 0xf8) == 0xf0)   /* 4-byte code, starts with 11110 */
            width = 4;
        else                              /* unknown encoding */
            return(0);

        /* every trailing byte must look like 10xxxxxx */
        for (k = 1; k < width; k++) {
            if ((utf[k] & 0xc0) != 0x80)
                return(0);
        }
        utf += width;
    }
    return(1);
}
819
820
/**
821
* xmlUTF8Strsize:
822
* @utf: a sequence of UTF-8 encoded bytes
823
* @len: the number of characters in the array
824
*
825
* storage size of an UTF8 string
826
* the behaviour is not guaranteed if the input string is not UTF-8
827
*
828
* Returns the storage size of
829
* the first 'len' characters of ARRAY
830
*/
831
832
int
833
xmlUTF8Strsize(const xmlChar *utf, int len) {
834
const xmlChar *ptr=utf;
835
int ch;
836
size_t ret;
837
838
if (utf == NULL)
839
return(0);
840
841
if (len <= 0)
842
return(0);
843
844
while ( len-- > 0) {
845
if ( !*ptr )
846
break;
847
if ( (ch = *ptr++) & 0x80)
848
while ((ch<<=1) & 0x80 ) {
849
if (*ptr == 0) break;
850
ptr++;
851
}
852
}
853
ret = ptr - utf;
854
return (ret > INT_MAX ? 0 : ret);
855
}
856
857
858
/**
859
* xmlUTF8Strndup:
860
* @utf: the input UTF8 *
861
* @len: the len of @utf (in chars)
862
*
863
* a strndup for array of UTF8's
864
*
865
* Returns a new UTF8 * or NULL
866
*/
867
xmlChar *
868
xmlUTF8Strndup(const xmlChar *utf, int len) {
869
xmlChar *ret;
870
int i;
871
872
if ((utf == NULL) || (len < 0)) return(NULL);
873
i = xmlUTF8Strsize(utf, len);
874
ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875
if (ret == NULL) {
876
return(NULL);
877
}
878
memcpy(ret, utf, i);
879
ret[i] = 0;
880
return(ret);
881
}
882
883
/**
884
* xmlUTF8Strpos:
885
* @utf: the input UTF8 *
886
* @pos: the position of the desired UTF8 char (in chars)
887
*
888
* a function to provide the equivalent of fetching a
889
* character from a string array
890
*
891
* Returns a pointer to the UTF8 character or NULL
892
*/
893
const xmlChar *
894
xmlUTF8Strpos(const xmlChar *utf, int pos) {
895
int ch;
896
897
if (utf == NULL) return(NULL);
898
if (pos < 0)
899
return(NULL);
900
while (pos--) {
901
if ((ch=*utf++) == 0) return(NULL);
902
if ( ch & 0x80 ) {
903
/* if not simple ascii, verify proper format */
904
if ( (ch & 0xc0) != 0xc0 )
905
return(NULL);
906
/* then skip over remaining bytes for this char */
907
while ( (ch <<= 1) & 0x80 )
908
if ( (*utf++ & 0xc0) != 0x80 )
909
return(NULL);
910
}
911
}
912
return((xmlChar *)utf);
913
}
914
915
/**
916
* xmlUTF8Strloc:
917
* @utf: the input UTF8 *
918
* @utfchar: the UTF8 character to be found
919
*
920
* a function to provide the relative location of a UTF8 char
921
*
922
* Returns the relative character position of the desired char
923
* or -1 if not found
924
*/
925
int
926
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927
size_t i;
928
int size;
929
int ch;
930
931
if (utf==NULL || utfchar==NULL) return -1;
932
size = xmlUTF8Strsize(utfchar, 1);
933
for(i=0; (ch=*utf) != 0; i++) {
934
if (xmlStrncmp(utf, utfchar, size)==0)
935
return(i > INT_MAX ? 0 : i);
936
utf++;
937
if ( ch & 0x80 ) {
938
/* if not simple ascii, verify proper format */
939
if ( (ch & 0xc0) != 0xc0 )
940
return(-1);
941
/* then skip over remaining bytes for this char */
942
while ( (ch <<= 1) & 0x80 )
943
if ( (*utf++ & 0xc0) != 0x80 )
944
return(-1);
945
}
946
}
947
948
return(-1);
949
}
950
/**
951
* xmlUTF8Strsub:
952
* @utf: a sequence of UTF-8 encoded bytes
953
* @start: relative pos of first char
954
* @len: total number to copy
955
*
956
* Create a substring from a given UTF-8 string
957
* Note: positions are given in units of UTF-8 chars
958
*
959
* Returns a pointer to a newly created string
960
* or NULL if any problem
961
*/
962
963
xmlChar *
964
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965
int i;
966
int ch;
967
968
if (utf == NULL) return(NULL);
969
if (start < 0) return(NULL);
970
if (len < 0) return(NULL);
971
972
/*
973
* Skip over any leading chars
974
*/
975
for (i = 0;i < start;i++) {
976
if ((ch=*utf++) == 0) return(NULL);
977
if ( ch & 0x80 ) {
978
/* if not simple ascii, verify proper format */
979
if ( (ch & 0xc0) != 0xc0 )
980
return(NULL);
981
/* then skip over remaining bytes for this char */
982
while ( (ch <<= 1) & 0x80 )
983
if ( (*utf++ & 0xc0) != 0x80 )
984
return(NULL);
985
}
986
}
987
988
return(xmlUTF8Strndup(utf, len));
989
}
990
991
/**
992
* xmlEscapeFormatString:
993
* @msg: a pointer to the string in which to escape '%' characters.
994
* Must be a heap-allocated buffer created by libxml2 that may be
995
* returned, or that may be freed and replaced.
996
*
997
* Replaces the string pointed to by 'msg' with an escaped string.
998
* Returns the same string with all '%' characters escaped.
999
*/
1000
xmlChar *
1001
xmlEscapeFormatString(xmlChar **msg)
1002
{
1003
xmlChar *msgPtr = NULL;
1004
xmlChar *result = NULL;
1005
xmlChar *resultPtr = NULL;
1006
size_t count = 0;
1007
size_t msgLen = 0;
1008
size_t resultLen = 0;
1009
1010
if (!msg || !*msg)
1011
return(NULL);
1012
1013
for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014
++msgLen;
1015
if (*msgPtr == '%')
1016
++count;
1017
}
1018
1019
if (count == 0)
1020
return(*msg);
1021
1022
if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023
return(NULL);
1024
resultLen = msgLen + count + 1;
1025
result = (xmlChar *) xmlMallocAtomic(resultLen);
1026
if (result == NULL) {
1027
/* Clear *msg to prevent format string vulnerabilities in
1028
out-of-memory situations. */
1029
xmlFree(*msg);
1030
*msg = NULL;
1031
return(NULL);
1032
}
1033
1034
for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035
*resultPtr = *msgPtr;
1036
if (*msgPtr == '%')
1037
*(++resultPtr) = '%';
1038
}
1039
result[resultLen - 1] = '\0';
1040
1041
xmlFree(*msg);
1042
*msg = result;
1043
1044
return *msg;
1045
}
1046
1047