CoCalc -- xmlstring.c

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/xmlstring.c
⁴³⁹³ views
1
/*
2
 * string.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <[email protected]>
12
 *
13
 * [email protected]
14
 */
15

16
#define IN_LIBXML
17
#include "libxml.h"
18

19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25

26
#include "private/parser.h"
27
#include "private/string.h"
28

29
/************************************************************************
30
 *                                                                      *
31
 *                Commodity functions to handle xmlChars                *
32
 *                                                                      *
33
 ************************************************************************/
34

35
/**
36
 * xmlStrndup:
37
 * @cur:  the input xmlChar *
38
 * @len:  the len of @cur
39
 *
40
 * a strndup for array of xmlChar's
41
 *
42
 * Returns a new xmlChar * or NULL
43
 */
44
xmlChar *
45
xmlStrndup(const xmlChar *cur, int len) {
46
    xmlChar *ret;
47

48
    if ((cur == NULL) || (len < 0)) return(NULL);
49
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
    if (ret == NULL) {
51
        return(NULL);
52
    }
53
    memcpy(ret, cur, len);
54
    ret[len] = 0;
55
    return(ret);
56
}
57

58
/**
59
 * xmlStrdup:
60
 * @cur:  the input xmlChar *
61
 *
62
 * a strdup for array of xmlChar's. Since they are supposed to be
63
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
64
 * a termination mark of '0'.
65
 *
66
 * Returns a new xmlChar * or NULL
67
 */
68
xmlChar *
69
xmlStrdup(const xmlChar *cur) {
70
    const xmlChar *p = cur;
71

72
    if (cur == NULL) return(NULL);
73
    while (*p != 0) p++; /* non input consuming */
74
    return(xmlStrndup(cur, p - cur));
75
}
76

77
/**
78
 * xmlCharStrndup:
79
 * @cur:  the input char *
80
 * @len:  the len of @cur
81
 *
82
 * a strndup for char's to xmlChar's
83
 *
84
 * Returns a new xmlChar * or NULL
85
 */
86

87
xmlChar *
88
xmlCharStrndup(const char *cur, int len) {
89
    int i;
90
    xmlChar *ret;
91

92
    if ((cur == NULL) || (len < 0)) return(NULL);
93
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94
    if (ret == NULL) {
95
        return(NULL);
96
    }
97
    for (i = 0;i < len;i++) {
98
        /* Explicit sign change */
99
        ret[i] = (xmlChar) cur[i];
100
        if (ret[i] == 0) return(ret);
101
    }
102
    ret[len] = 0;
103
    return(ret);
104
}
105

106
/**
107
 * xmlCharStrdup:
108
 * @cur:  the input char *
109
 *
110
 * a strdup for char's to xmlChar's
111
 *
112
 * Returns a new xmlChar * or NULL
113
 */
114

115
xmlChar *
116
xmlCharStrdup(const char *cur) {
117
    const char *p = cur;
118

119
    if (cur == NULL) return(NULL);
120
    while (*p != '\0') p++; /* non input consuming */
121
    return(xmlCharStrndup(cur, p - cur));
122
}
123

124
/**
125
 * xmlStrcmp:
126
 * @str1:  the first xmlChar *
127
 * @str2:  the second xmlChar *
128
 *
129
 * a strcmp for xmlChar's
130
 *
131
 * Returns the integer result of the comparison
132
 */
133

134
int
135
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
136
    if (str1 == str2) return(0);
137
    if (str1 == NULL) return(-1);
138
    if (str2 == NULL) return(1);
139
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140
    return(strcmp((const char *)str1, (const char *)str2));
141
#else
142
    do {
143
        int tmp = *str1++ - *str2;
144
        if (tmp != 0) return(tmp);
145
    } while (*str2++ != 0);
146
    return 0;
147
#endif
148
}
149

150
/**
151
 * xmlStrEqual:
152
 * @str1:  the first xmlChar *
153
 * @str2:  the second xmlChar *
154
 *
155
 * Check if both strings are equal of have same content.
156
 * Should be a bit more readable and faster than xmlStrcmp()
157
 *
158
 * Returns 1 if they are equal, 0 if they are different
159
 */
160

161
int
162
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
163
    if (str1 == str2) return(1);
164
    if (str1 == NULL) return(0);
165
    if (str2 == NULL) return(0);
166
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167
    return(strcmp((const char *)str1, (const char *)str2) == 0);
168
#else
169
    do {
170
        if (*str1++ != *str2) return(0);
171
    } while (*str2++);
172
    return(1);
173
#endif
174
}
175

176
/**
177
 * xmlStrQEqual:
178
 * @pref:  the prefix of the QName
179
 * @name:  the localname of the QName
180
 * @str:  the second xmlChar *
181
 *
182
 * Check if a QName is Equal to a given string
183
 *
184
 * Returns 1 if they are equal, 0 if they are different
185
 */
186

187
int
188
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
189
    if (pref == NULL) return(xmlStrEqual(name, str));
190
    if (name == NULL) return(0);
191
    if (str == NULL) return(0);
192

193
    do {
194
        if (*pref++ != *str) return(0);
195
    } while ((*str++) && (*pref));
196
    if (*str++ != ':') return(0);
197
    do {
198
        if (*name++ != *str) return(0);
199
    } while (*str++);
200
    return(1);
201
}
202

203
/**
204
 * xmlStrncmp:
205
 * @str1:  the first xmlChar *
206
 * @str2:  the second xmlChar *
207
 * @len:  the max comparison length
208
 *
209
 * a strncmp for xmlChar's
210
 *
211
 * Returns the integer result of the comparison
212
 */
213

214
int
215
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
216
    if (len <= 0) return(0);
217
    if (str1 == str2) return(0);
218
    if (str1 == NULL) return(-1);
219
    if (str2 == NULL) return(1);
220
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221
    return(strncmp((const char *)str1, (const char *)str2, len));
222
#else
223
    do {
224
        int tmp = *str1++ - *str2;
225
        if (tmp != 0 || --len == 0) return(tmp);
226
    } while (*str2++ != 0);
227
    return 0;
228
#endif
229
}
230

231
/*
 * casemap:
 *
 * Byte-wise case folding table used by the case-insensitive helpers:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A), every other byte
 * value maps to itself. In particular '[' (0x5B) must stay 0x5B so that
 * it does not compare equal to '{' (0x7B).
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
265

266
/**
267
 * xmlStrcasecmp:
268
 * @str1:  the first xmlChar *
269
 * @str2:  the second xmlChar *
270
 *
271
 * a strcasecmp for xmlChar's
272
 *
273
 * Returns the integer result of the comparison
274
 */
275

276
int
277
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
278
    register int tmp;
279

280
    if (str1 == str2) return(0);
281
    if (str1 == NULL) return(-1);
282
    if (str2 == NULL) return(1);
283
    do {
284
        tmp = casemap[*str1++] - casemap[*str2];
285
        if (tmp != 0) return(tmp);
286
    } while (*str2++ != 0);
287
    return 0;
288
}
289

290
/**
291
 * xmlStrncasecmp:
292
 * @str1:  the first xmlChar *
293
 * @str2:  the second xmlChar *
294
 * @len:  the max comparison length
295
 *
296
 * a strncasecmp for xmlChar's
297
 *
298
 * Returns the integer result of the comparison
299
 */
300

301
int
302
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
303
    register int tmp;
304

305
    if (len <= 0) return(0);
306
    if (str1 == str2) return(0);
307
    if (str1 == NULL) return(-1);
308
    if (str2 == NULL) return(1);
309
    do {
310
        tmp = casemap[*str1++] - casemap[*str2];
311
        if (tmp != 0 || --len == 0) return(tmp);
312
    } while (*str2++ != 0);
313
    return 0;
314
}
315

316
/**
317
 * xmlStrchr:
318
 * @str:  the xmlChar * array
319
 * @val:  the xmlChar to search
320
 *
321
 * a strchr for xmlChar's
322
 *
323
 * Returns the xmlChar * for the first occurrence or NULL.
324
 */
325

326
const xmlChar *
327
xmlStrchr(const xmlChar *str, xmlChar val) {
328
    if (str == NULL) return(NULL);
329
    while (*str != 0) { /* non input consuming */
330
        if (*str == val) return((xmlChar *) str);
331
        str++;
332
    }
333
    return(NULL);
334
}
335

336
/**
337
 * xmlStrstr:
338
 * @str:  the xmlChar * array (haystack)
339
 * @val:  the xmlChar to search (needle)
340
 *
341
 * a strstr for xmlChar's
342
 *
343
 * Returns the xmlChar * for the first occurrence or NULL.
344
 */
345

346
const xmlChar *
347
xmlStrstr(const xmlChar *str, const xmlChar *val) {
348
    int n;
349

350
    if (str == NULL) return(NULL);
351
    if (val == NULL) return(NULL);
352
    n = xmlStrlen(val);
353

354
    if (n == 0) return(str);
355
    while (*str != 0) { /* non input consuming */
356
        if (*str == *val) {
357
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358
        }
359
        str++;
360
    }
361
    return(NULL);
362
}
363

364
/**
365
 * xmlStrcasestr:
366
 * @str:  the xmlChar * array (haystack)
367
 * @val:  the xmlChar to search (needle)
368
 *
369
 * a case-ignoring strstr for xmlChar's
370
 *
371
 * Returns the xmlChar * for the first occurrence or NULL.
372
 */
373

374
const xmlChar *
375
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
376
    int n;
377

378
    if (str == NULL) return(NULL);
379
    if (val == NULL) return(NULL);
380
    n = xmlStrlen(val);
381

382
    if (n == 0) return(str);
383
    while (*str != 0) { /* non input consuming */
384
        if (casemap[*str] == casemap[*val])
385
            if (!xmlStrncasecmp(str, val, n)) return(str);
386
        str++;
387
    }
388
    return(NULL);
389
}
390

391
/**
392
 * xmlStrsub:
393
 * @str:  the xmlChar * array (haystack)
394
 * @start:  the index of the first char (zero based)
395
 * @len:  the length of the substring
396
 *
397
 * Extract a substring of a given string
398
 *
399
 * Returns the xmlChar * for the first occurrence or NULL.
400
 */
401

402
xmlChar *
403
xmlStrsub(const xmlChar *str, int start, int len) {
404
    int i;
405

406
    if (str == NULL) return(NULL);
407
    if (start < 0) return(NULL);
408
    if (len < 0) return(NULL);
409

410
    for (i = 0;i < start;i++) {
411
        if (*str == 0) return(NULL);
412
        str++;
413
    }
414
    if (*str == 0) return(NULL);
415
    return(xmlStrndup(str, len));
416
}
417

418
/**
419
 * xmlStrlen:
420
 * @str:  the xmlChar * array
421
 *
422
 * length of a xmlChar's string
423
 *
424
 * Returns the number of xmlChar contained in the ARRAY.
425
 */
426

427
int
428
xmlStrlen(const xmlChar *str) {
429
    size_t len = str ? strlen((const char *)str) : 0;
430
    return(len > INT_MAX ? 0 : len);
431
}
432

433
/**
434
 * xmlStrncat:
435
 * @cur:  the original xmlChar * array
436
 * @add:  the xmlChar * array added
437
 * @len:  the length of @add
438
 *
439
 * a strncat for array of xmlChar's, it will extend @cur with the len
440
 * first bytes of @add. Note that if @len < 0 then this is an API error
441
 * and NULL will be returned.
442
 *
443
 * Returns a new xmlChar *, the original @cur is reallocated and should
444
 * not be freed.
445
 */
446

447
xmlChar *
448
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449
    int size;
450
    xmlChar *ret;
451

452
    if ((add == NULL) || (len == 0))
453
        return(cur);
454
    if (len < 0)
455
	return(NULL);
456
    if (cur == NULL)
457
        return(xmlStrndup(add, len));
458

459
    size = xmlStrlen(cur);
460
    if ((size < 0) || (size > INT_MAX - len))
461
        return(NULL);
462
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463
    if (ret == NULL) {
464
        return(cur);
465
    }
466
    memcpy(&ret[size], add, len);
467
    ret[size + len] = 0;
468
    return(ret);
469
}
470

471
/**
472
 * xmlStrncatNew:
473
 * @str1:  first xmlChar string
474
 * @str2:  second xmlChar string
475
 * @len:  the len of @str2 or < 0
476
 *
477
 * same as xmlStrncat, but creates a new string.  The original
478
 * two strings are not freed. If @len is < 0 then the length
479
 * will be calculated automatically.
480
 *
481
 * Returns a new xmlChar * or NULL
482
 */
483
xmlChar *
484
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
485
    int size;
486
    xmlChar *ret;
487

488
    if (len < 0) {
489
        len = xmlStrlen(str2);
490
        if (len < 0)
491
            return(NULL);
492
    }
493
    if ((str2 == NULL) || (len == 0))
494
        return(xmlStrdup(str1));
495
    if (str1 == NULL)
496
        return(xmlStrndup(str2, len));
497

498
    size = xmlStrlen(str1);
499
    if ((size < 0) || (size > INT_MAX - len))
500
        return(NULL);
501
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502
    if (ret == NULL) {
503
        return(xmlStrndup(str1, size));
504
    }
505
    memcpy(ret, str1, size);
506
    memcpy(&ret[size], str2, len);
507
    ret[size + len] = 0;
508
    return(ret);
509
}
510

511
/**
512
 * xmlStrcat:
513
 * @cur:  the original xmlChar * array
514
 * @add:  the xmlChar * array added
515
 *
516
 * a strcat for array of xmlChar's. Since they are supposed to be
517
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
518
 * a termination mark of '0'.
519
 *
520
 * Returns a new xmlChar * containing the concatenated string. The original
521
 * @cur is reallocated and should not be freed.
522
 */
523
xmlChar *
524
xmlStrcat(xmlChar *cur, const xmlChar *add) {
525
    const xmlChar *p = add;
526

527
    if (add == NULL) return(cur);
528
    if (cur == NULL)
529
        return(xmlStrdup(add));
530

531
    while (*p != 0) p++; /* non input consuming */
532
    return(xmlStrncat(cur, add, p - add));
533
}
534

535
/**
536
 * xmlStrPrintf:
537
 * @buf:   the result buffer.
538
 * @len:   the result buffer length.
539
 * @msg:   the message with printf formatting.
540
 * @...:   extra parameters for the message.
541
 *
542
 * Formats @msg and places result into @buf.
543
 *
544
 * Returns the number of characters written to @buf or -1 if an error occurs.
545
 */
546
int
547
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
548
    va_list args;
549
    int ret;
550

551
    if((buf == NULL) || (msg == NULL)) {
552
        return(-1);
553
    }
554

555
    va_start(args, msg);
556
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
557
    va_end(args);
558
    buf[len - 1] = 0; /* be safe ! */
559

560
    return(ret);
561
}
562

563
/**
564
 * xmlStrVPrintf:
565
 * @buf:   the result buffer.
566
 * @len:   the result buffer length.
567
 * @msg:   the message with printf formatting.
568
 * @ap:    extra parameters for the message.
569
 *
570
 * Formats @msg and places result into @buf.
571
 *
572
 * Returns the number of characters written to @buf or -1 if an error occurs.
573
 */
574
int
575
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
576
    int ret;
577

578
    if((buf == NULL) || (msg == NULL)) {
579
        return(-1);
580
    }
581

582
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
583
    buf[len - 1] = 0; /* be safe ! */
584

585
    return(ret);
586
}
587

588
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-0010 FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * The original author hoped values > 0xFFFF would not be needed soon;  *
 * the routines below nevertheless handle 4-byte sequences.             *
 *                                                                      *
 ************************************************************************/
602

603

604
/**
605
 * xmlUTF8Size:
606
 * @utf: pointer to the UTF8 character
607
 *
608
 * calculates the internal size of a UTF8 character
609
 *
610
 * returns the numbers of bytes in the character, -1 on format error
611
 */
612
int
613
xmlUTF8Size(const xmlChar *utf) {
614
    xmlChar mask;
615
    int len;
616

617
    if (utf == NULL)
618
        return -1;
619
    if (*utf < 0x80)
620
        return 1;
621
    /* check valid UTF8 character */
622
    if (!(*utf & 0x40))
623
        return -1;
624
    /* determine number of bytes in char */
625
    len = 2;
626
    for (mask=0x20; mask != 0; mask>>=1) {
627
        if (!(*utf & mask))
628
            return len;
629
        len++;
630
    }
631
    return -1;
632
}
633

634
/**
635
 * xmlUTF8Charcmp:
636
 * @utf1: pointer to first UTF8 char
637
 * @utf2: pointer to second UTF8 char
638
 *
639
 * compares the two UCS4 values
640
 *
641
 * returns result of the compare as with xmlStrncmp
642
 */
643
int
644
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645

646
    if (utf1 == NULL ) {
647
        if (utf2 == NULL)
648
            return 0;
649
        return -1;
650
    }
651
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652
}
653

654
/**
655
 * xmlUTF8Strlen:
656
 * @utf:  a sequence of UTF-8 encoded bytes
657
 *
658
 * compute the length of an UTF8 string, it doesn't do a full UTF8
659
 * checking of the content of the string.
660
 *
661
 * Returns the number of characters in the string or -1 in case of error
662
 */
663
int
664
xmlUTF8Strlen(const xmlChar *utf) {
665
    size_t ret = 0;
666

667
    if (utf == NULL)
668
        return(-1);
669

670
    while (*utf != 0) {
671
        if (utf[0] & 0x80) {
672
            if ((utf[1] & 0xc0) != 0x80)
673
                return(-1);
674
            if ((utf[0] & 0xe0) == 0xe0) {
675
                if ((utf[2] & 0xc0) != 0x80)
676
                    return(-1);
677
                if ((utf[0] & 0xf0) == 0xf0) {
678
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
679
                        return(-1);
680
                    utf += 4;
681
                } else {
682
                    utf += 3;
683
                }
684
            } else {
685
                utf += 2;
686
            }
687
        } else {
688
            utf++;
689
        }
690
        ret++;
691
    }
692
    return(ret > INT_MAX ? 0 : ret);
693
}
694

695
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Decodes the first UTF8 character of @utf. Overlong 2-, 3- and 4-byte
 * encodings, surrogates (0xD800 to 0xDFFF) and values of 0x110000 and
 * above are rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead;
    unsigned int val;
    int avail;

    /* without @len there is nothing to report through */
    if (len == NULL)
        return(-1);
    if (utf == NULL)
        goto fail;

    avail = *len;
    lead = utf[0];

    /* 1-byte code */
    if (lead < 0x80) {
        if (avail < 1)
            goto fail;
        *len = 1;
        return((int) lead);
    }

    /* every longer form needs a first trailing byte */
    if ((avail < 2) || ((utf[1] & 0xc0) != 0x80))
        goto fail;

    /* 2-byte code; leads below 0xc2 are stray or overlong */
    if (lead < 0xe0) {
        if (lead < 0xc2)
            goto fail;
        val = ((lead & 0x1f) << 6) | (utf[1] & 0x3f);
        *len = 2;
        return((int) val);
    }

    if ((avail < 3) || ((utf[2] & 0xc0) != 0x80))
        goto fail;

    /* 3-byte code; reject overlong forms and surrogates */
    if (lead < 0xf0) {
        val = ((lead & 0xf) << 12) | ((utf[1] & 0x3f) << 6) |
              (utf[2] & 0x3f);
        if ((val < 0x800) || ((val >= 0xd800) && (val < 0xe000)))
            goto fail;
        *len = 3;
        return((int) val);
    }

    if ((avail < 4) || ((utf[3] & 0xc0) != 0x80))
        goto fail;

    /* 4-byte code; reject overlong forms and out of range values */
    val = ((lead & 0x7) << 18) | ((utf[1] & 0x3f) << 12) |
          ((utf[2] & 0x3f) << 6) | (utf[3] & 0x3f);
    if ((val < 0x10000) || (val >= 0x110000))
        goto fail;
    *len = 4;
    return((int) val);

fail:
    *len = 0;
    return(-1);
}
764

765
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;

    if (utf == NULL)
        return(0);

    /*
     * Accepted sequences, in "bit format":
     *    0xxxxxxx                                      1-byte
     *    110xxxxx 10xxxxxx                             2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           4-byte
     */
    for (cur = utf; *cur != 0; ) {      /* string is 0-terminated */
        unsigned char lead = *cur;
        int trail;
        int k;

        if ((lead & 0x80) == 0x00)
            trail = 0;                  /* starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            trail = 1;                  /* starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            trail = 2;                  /* starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            trail = 3;                  /* starts with 11110 */
        else
            return(0);                  /* unknown encoding */

        /* every trailing byte must look like 10xxxxxx */
        for (k = 1; k <= trail; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
        cur += trail + 1;
    }
    return(1);
}
819

820
/**
821
 * xmlUTF8Strsize:
822
 * @utf:  a sequence of UTF-8 encoded bytes
823
 * @len:  the number of characters in the array
824
 *
825
 * storage size of an UTF8 string
826
 * the behaviour is not guaranteed if the input string is not UTF-8
827
 *
828
 * Returns the storage size of
829
 * the first 'len' characters of ARRAY
830
 */
831

832
int
833
xmlUTF8Strsize(const xmlChar *utf, int len) {
834
    const xmlChar *ptr=utf;
835
    int ch;
836
    size_t ret;
837

838
    if (utf == NULL)
839
        return(0);
840

841
    if (len <= 0)
842
        return(0);
843

844
    while ( len-- > 0) {
845
        if ( !*ptr )
846
            break;
847
        if ( (ch = *ptr++) & 0x80)
848
            while ((ch<<=1) & 0x80 ) {
849
		if (*ptr == 0) break;
850
                ptr++;
851
	    }
852
    }
853
    ret = ptr - utf;
854
    return (ret > INT_MAX ? 0 : ret);
855
}
856

857

858
/**
859
 * xmlUTF8Strndup:
860
 * @utf:  the input UTF8 *
861
 * @len:  the len of @utf (in chars)
862
 *
863
 * a strndup for array of UTF8's
864
 *
865
 * Returns a new UTF8 * or NULL
866
 */
867
xmlChar *
868
xmlUTF8Strndup(const xmlChar *utf, int len) {
869
    xmlChar *ret;
870
    int i;
871

872
    if ((utf == NULL) || (len < 0)) return(NULL);
873
    i = xmlUTF8Strsize(utf, len);
874
    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875
    if (ret == NULL) {
876
        return(NULL);
877
    }
878
    memcpy(ret, utf, i);
879
    ret[i] = 0;
880
    return(ret);
881
}
882

883
/**
884
 * xmlUTF8Strpos:
885
 * @utf:  the input UTF8 *
886
 * @pos:  the position of the desired UTF8 char (in chars)
887
 *
888
 * a function to provide the equivalent of fetching a
889
 * character from a string array
890
 *
891
 * Returns a pointer to the UTF8 character or NULL
892
 */
893
const xmlChar *
894
xmlUTF8Strpos(const xmlChar *utf, int pos) {
895
    int ch;
896

897
    if (utf == NULL) return(NULL);
898
    if (pos < 0)
899
        return(NULL);
900
    while (pos--) {
901
        if ((ch=*utf++) == 0) return(NULL);
902
        if ( ch & 0x80 ) {
903
            /* if not simple ascii, verify proper format */
904
            if ( (ch & 0xc0) != 0xc0 )
905
                return(NULL);
906
            /* then skip over remaining bytes for this char */
907
            while ( (ch <<= 1) & 0x80 )
908
                if ( (*utf++ & 0xc0) != 0x80 )
909
                    return(NULL);
910
        }
911
    }
912
    return((xmlChar *)utf);
913
}
914

915
/**
916
 * xmlUTF8Strloc:
917
 * @utf:  the input UTF8 *
918
 * @utfchar:  the UTF8 character to be found
919
 *
920
 * a function to provide the relative location of a UTF8 char
921
 *
922
 * Returns the relative character position of the desired char
923
 * or -1 if not found
924
 */
925
int
926
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927
    size_t i;
928
    int size;
929
    int ch;
930

931
    if (utf==NULL || utfchar==NULL) return -1;
932
    size = xmlUTF8Strsize(utfchar, 1);
933
        for(i=0; (ch=*utf) != 0; i++) {
934
            if (xmlStrncmp(utf, utfchar, size)==0)
935
                return(i > INT_MAX ? 0 : i);
936
            utf++;
937
            if ( ch & 0x80 ) {
938
                /* if not simple ascii, verify proper format */
939
                if ( (ch & 0xc0) != 0xc0 )
940
                    return(-1);
941
                /* then skip over remaining bytes for this char */
942
                while ( (ch <<= 1) & 0x80 )
943
                    if ( (*utf++ & 0xc0) != 0x80 )
944
                        return(-1);
945
            }
946
        }
947

948
    return(-1);
949
}
950
/**
951
 * xmlUTF8Strsub:
952
 * @utf:  a sequence of UTF-8 encoded bytes
953
 * @start: relative pos of first char
954
 * @len:   total number to copy
955
 *
956
 * Create a substring from a given UTF-8 string
957
 * Note:  positions are given in units of UTF-8 chars
958
 *
959
 * Returns a pointer to a newly created string
960
 * or NULL if any problem
961
 */
962

963
xmlChar *
964
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965
    int i;
966
    int ch;
967

968
    if (utf == NULL) return(NULL);
969
    if (start < 0) return(NULL);
970
    if (len < 0) return(NULL);
971

972
    /*
973
     * Skip over any leading chars
974
     */
975
    for (i = 0;i < start;i++) {
976
        if ((ch=*utf++) == 0) return(NULL);
977
        if ( ch & 0x80 ) {
978
            /* if not simple ascii, verify proper format */
979
            if ( (ch & 0xc0) != 0xc0 )
980
                return(NULL);
981
            /* then skip over remaining bytes for this char */
982
            while ( (ch <<= 1) & 0x80 )
983
                if ( (*utf++ & 0xc0) != 0x80 )
984
                    return(NULL);
985
        }
986
    }
987

988
    return(xmlUTF8Strndup(utf, len));
989
}
990

991
/**
992
 * xmlEscapeFormatString:
993
 * @msg:  a pointer to the string in which to escape '%' characters.
994
 * Must be a heap-allocated buffer created by libxml2 that may be
995
 * returned, or that may be freed and replaced.
996
 *
997
 * Replaces the string pointed to by 'msg' with an escaped string.
998
 * Returns the same string with all '%' characters escaped.
999
 */
1000
xmlChar *
1001
xmlEscapeFormatString(xmlChar **msg)
1002
{
1003
    xmlChar *msgPtr = NULL;
1004
    xmlChar *result = NULL;
1005
    xmlChar *resultPtr = NULL;
1006
    size_t count = 0;
1007
    size_t msgLen = 0;
1008
    size_t resultLen = 0;
1009

1010
    if (!msg || !*msg)
1011
        return(NULL);
1012

1013
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014
        ++msgLen;
1015
        if (*msgPtr == '%')
1016
            ++count;
1017
    }
1018

1019
    if (count == 0)
1020
        return(*msg);
1021

1022
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023
        return(NULL);
1024
    resultLen = msgLen + count + 1;
1025
    result = (xmlChar *) xmlMallocAtomic(resultLen);
1026
    if (result == NULL) {
1027
        /* Clear *msg to prevent format string vulnerabilities in
1028
           out-of-memory situations. */
1029
        xmlFree(*msg);
1030
        *msg = NULL;
1031
        return(NULL);
1032
    }
1033

1034
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035
        *resultPtr = *msgPtr;
1036
        if (*msgPtr == '%')
1037
            *(++resultPtr) = '%';
1038
    }
1039
    result[resultLen - 1] = '\0';
1040

1041
    xmlFree(*msg);
1042
    *msg = result;
1043

1044
    return *msg;
1045
}
1046

1047
Product

Resources

Company