/* Source: GitHub repository freebsd/freebsd-src,
 * path contrib/llvm-project/llvm/lib/Support/ConvertUTF.cpp (branch main). */
1
/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2
*
3
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
* See https://llvm.org/LICENSE.txt for license information.
5
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
*
7
*===------------------------------------------------------------------------=*/
8
/*
9
* Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10
* Distributed under the Terms of Use in
11
* http://www.unicode.org/copyright.html.
12
*
13
* Permission is hereby granted, free of charge, to any person obtaining
14
* a copy of the Unicode data files and any associated documentation
15
* (the "Data Files") or Unicode software and any associated documentation
16
* (the "Software") to deal in the Data Files or Software
17
* without restriction, including without limitation the rights to use,
18
* copy, modify, merge, publish, distribute, and/or sell copies of
19
* the Data Files or Software, and to permit persons to whom the Data Files
20
* or Software are furnished to do so, provided that
21
* (a) this copyright and permission notice appear with all copies
22
* of the Data Files or Software,
23
* (b) this copyright and permission notice appear in associated
24
* documentation, and
25
* (c) there is clear notice in each modified Data File or in the Software
26
* as well as in the documentation associated with the Data File(s) or
27
* Software that the data or software has been modified.
28
*
29
* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30
* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32
* NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34
* NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35
* DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38
* PERFORMANCE OF THE DATA FILES OR SOFTWARE.
39
*
40
* Except as contained in this notice, the name of a copyright holder
41
* shall not be used in advertising or otherwise to promote the sale,
42
* use or other dealings in these Data Files or Software without prior
43
* written authorization of the copyright holder.
44
*/
45
46
/* ---------------------------------------------------------------------
47
48
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
49
Author: Mark E. Davis, 1994.
50
Rev History: Rick McGowan, fixes & updates May 2001.
51
Sept 2001: fixed const & error conditions per
52
mods suggested by S. Parent & A. Lillich.
53
June 2002: Tim Dodd added detection and handling of incomplete
54
source sequences, enhanced error detection, added casts
55
to eliminate compiler warnings.
56
July 2003: slight mods to back out aggressive FFFE detection.
57
Jan 2004: updated switches in from-UTF8 conversions.
58
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
59
60
See the header file "ConvertUTF.h" for complete documentation.
61
62
------------------------------------------------------------------------ */
63
64
#include "llvm/Support/ConvertUTF.h"
65
#ifdef CVTUTF_DEBUG
66
#include <stdio.h>
67
#endif
68
#include <assert.h>
69
70
/*
 * The conversion routines in this file are written with switch statements
 * whose cases intentionally fall through.  Silence -Wimplicit-fallthrough
 * from here on; the matching ConvertUTF_RESTORE_WARNINGS at the end of the
 * file pops the diagnostic state again.
 */
#if defined(__clang__) && defined(__has_warning)
#if __has_warning("-Wimplicit-fallthrough")
#define ConvertUTF_DISABLE_WARNINGS                                            \
    _Pragma("clang diagnostic push")                                           \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS _Pragma("clang diagnostic pop")
#endif
#elif defined(__GNUC__) && __GNUC__ > 6
#define ConvertUTF_DISABLE_WARNINGS                                            \
    _Pragma("GCC diagnostic push")                                             \
    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS _Pragma("GCC diagnostic pop")
#endif

/* Compilers not matched above get empty (no-op) macros. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#define ConvertUTF_RESTORE_WARNINGS
#endif

ConvertUTF_DISABLE_WARNINGS
97
98
namespace llvm {
99
100
static const int halfShift = 10; /* used for shifting by 10 bits */
101
102
static const UTF32 halfBase = 0x0010000UL;
103
static const UTF32 halfMask = 0x3FFUL;
104
105
#define UNI_SUR_HIGH_START (UTF32)0xD800
106
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
107
#define UNI_SUR_LOW_START (UTF32)0xDC00
108
#define UNI_SUR_LOW_END (UTF32)0xDFFF
109
110
/* --------------------------------------------------------------------- */
111
112
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow that byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes.  Those entries
 * are kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
129
130
/*
131
* Magic values subtracted from a buffer value during UTF8 conversion.
132
* This table contains as many values as there might be trailing bytes
133
* in a UTF-8 sequence.
134
*/
135
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
136
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
137
138
/*
139
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
140
* into the first byte, depending on how many bytes follow. There are
141
* as many entries in this table as there are UTF-8 sequence types.
142
* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
143
* for *legal* UTF-8 will be 4 or fewer bytes total.
144
*/
145
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
146
147
/* --------------------------------------------------------------------- */
148
149
/* The interface converts a whole buffer to avoid function-call overhead.
150
* Constants have been gathered. Loops & conditionals have been removed as
151
* much as possible for efficiency, in favor of drop-through switches.
152
* (See "Note A" at the bottom of the file for equivalent code.)
153
* If your compiler supports it, the "isLegalUTF8" call can be turned
154
* into an inline function.
155
*/
156
157
158
/* --------------------------------------------------------------------- */
159
160
/*
 * Convert UTF-32 code units in [*sourceStart, sourceEnd) to UTF-16 written
 * to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   targetExhausted  - the next character does not fit in the target; the
 *                      source pointer is left on that character.
 *   sourceIllegal    - strict mode only: a surrogate value or a value above
 *                      UNI_MAX_LEGAL_UTF32 was found; the source pointer is
 *                      left on the offending value.
 *
 * In lenient mode illegal values are replaced with UNI_REPLACEMENT_CHAR.
 */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }
        ch = *source++;
        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
            /* UTF-16 surrogate values are illegal in UTF-32. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop at the offending value, exactly like the surrogate
                 * case above, instead of silently dropping it and carrying
                 * on with a result code that later iterations may overwrite.
                 */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted; break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
206
207
/* --------------------------------------------------------------------- */
208
209
/*
 * Convert UTF-16 code units in [*sourceStart, sourceEnd) to UTF-32 written
 * to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends with a high surrogate whose partner
 *                      is not in the buffer; the source pointer is left on
 *                      the high surrogate.
 *   targetExhausted  - no room for the next character; the source pointer
 *                      is backed up to the start of that character.
 *   sourceIllegal    - strict mode only: an unpaired surrogate was found.
 *
 * In lenient mode unpaired surrogates are copied through unchanged.
 */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /*
     * Both are initialized because the CVTUTF_DEBUG report below prints them
     * even on paths that never assign ch2 (a lone low surrogate).
     */
    UTF32 ch = 0, ch2 = 0;
    while (source < sourceEnd) {
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
        ch = *source++;
        ch2 = 0; /* do not report a stale unit from an earlier pair */
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal) {
        fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
        fflush(stderr);
    }
#endif
    return result;
}
263
/*
 * Convert UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8 written
 * to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends with a high surrogate whose partner
 *                      is not in the buffer.
 *   targetExhausted  - the encoded character does not fit; the source
 *                      pointer is backed up to the start of the character.
 *   sourceIllegal    - strict mode only: an unpaired surrogate was found.
 *
 * In lenient mode an unpaired surrogate is encoded as a 3-byte sequence.
 */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF8* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
        ch = *source++;
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                UTF32 ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /* Figure out how many bytes the result will require */
        if (ch < (UTF32)0x80) {             bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
        } else {                            bytesToWrite = 3;
                                            ch = UNI_REPLACEMENT_CHAR;
        }

        /*
         * Compare against the remaining space before moving the pointer:
         * advancing target first could form a pointer beyond one-past-the-end
         * of the output buffer, which is undefined behaviour.
         */
        if (bytesToWrite > targetEnd - target) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        /* Jump to the end of the sequence and fill it in back to front. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
330
331
/* --------------------------------------------------------------------- */
332
333
/*
 * Convert UTF-32 code units in [*sourceStart, sourceEnd) to UTF-8 written
 * to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   targetExhausted  - the encoded character does not fit; the source
 *                      pointer is left on that character.
 *   sourceIllegal    - strict mode: a surrogate value was found and the
 *                      conversion stopped on it.  In either mode: a value
 *                      above UNI_MAX_LEGAL_UTF32 was replaced with
 *                      UNI_REPLACEMENT_CHAR and the conversion continued
 *                      (a later targetExhausted overwrites this code).
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF8* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        ch = *source++;
        if (flags == strictConversion ) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /*
         * Figure out how many bytes the result will require. Turn any
         * illegally large UTF32 things (> Plane 17) into replacement chars.
         */
        if (ch < (UTF32)0x80) {                 bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {         bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {       bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
        } else {                                bytesToWrite = 3;
                                                ch = UNI_REPLACEMENT_CHAR;
                                                result = sourceIllegal;
        }

        /*
         * Compare against the remaining space before moving the pointer:
         * advancing target first could form a pointer beyond one-past-the-end
         * of the output buffer, which is undefined behaviour.
         */
        if (bytesToWrite > targetEnd - target) {
            --source; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        /* Jump to the end of the sequence and fill it in back to front. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
383
384
/* --------------------------------------------------------------------- */
385
386
/*
387
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
388
* This must be called with the length pre-determined by the first byte.
389
* If not calling this from ConvertUTF8to*, then the length can be set by:
390
* length = trailingBytesForUTF8[*source]+1;
391
* and the sequence is illegal right away if there aren't that many bytes
392
* available.
393
* If presented with a length > 4, this returns false. The Unicode
394
* definition of UTF-8 goes up to 4-byte sequences.
395
*/
396
397
static Boolean isLegalUTF8(const UTF8 *source, int length) {
398
UTF8 a;
399
const UTF8 *srcptr = source+length;
400
switch (length) {
401
default: return false;
402
/* Everything else falls through when "true"... */
403
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
404
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
405
case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
406
407
switch (*source) {
408
/* no fall-through in this inner switch */
409
case 0xE0: if (a < 0xA0) return false; break;
410
case 0xED: if (a > 0x9F) return false; break;
411
case 0xF0: if (a < 0x90) return false; break;
412
case 0xF4: if (a > 0x8F) return false; break;
413
default: if (a < 0x80) return false;
414
}
415
416
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
417
}
418
if (*source > 0xF4) return false;
419
return true;
420
}
421
422
/* --------------------------------------------------------------------- */
423
424
/*
425
* Exported function to return whether a UTF-8 sequence is legal or not.
426
* This is not used here; it's just exported.
427
*/
428
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
429
int length = trailingBytesForUTF8[*source]+1;
430
if (length > sourceEnd - source) {
431
return false;
432
}
433
return isLegalUTF8(source, length);
434
}
435
436
/*
437
* Exported function to return the size of the first utf-8 code unit sequence,
438
* Or 0 if the sequence is not valid;
439
*/
440
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
441
int length = trailingBytesForUTF8[*source] + 1;
442
return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443
: 0;
444
}
445
446
/* --------------------------------------------------------------------- */
447
448
static unsigned
449
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
450
const UTF8 *sourceEnd) {
451
UTF8 b1, b2, b3;
452
453
assert(!isLegalUTF8Sequence(source, sourceEnd));
454
455
/*
456
* Unicode 6.3.0, D93b:
457
*
458
* Maximal subpart of an ill-formed subsequence: The longest code unit
459
* subsequence starting at an unconvertible offset that is either:
460
* a. the initial subsequence of a well-formed code unit sequence, or
461
* b. a subsequence of length one.
462
*/
463
464
if (source == sourceEnd)
465
return 0;
466
467
/*
468
* Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
469
* Byte Sequences.
470
*/
471
472
b1 = *source;
473
++source;
474
if (b1 >= 0xC2 && b1 <= 0xDF) {
475
/*
476
* First byte is valid, but we know that this code unit sequence is
477
* invalid, so the maximal subpart has to end after the first byte.
478
*/
479
return 1;
480
}
481
482
if (source == sourceEnd)
483
return 1;
484
485
b2 = *source;
486
++source;
487
488
if (b1 == 0xE0) {
489
return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
490
}
491
if (b1 >= 0xE1 && b1 <= 0xEC) {
492
return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
493
}
494
if (b1 == 0xED) {
495
return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
496
}
497
if (b1 >= 0xEE && b1 <= 0xEF) {
498
return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
499
}
500
if (b1 == 0xF0) {
501
if (b2 >= 0x90 && b2 <= 0xBF) {
502
if (source == sourceEnd)
503
return 2;
504
505
b3 = *source;
506
return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
507
}
508
return 1;
509
}
510
if (b1 >= 0xF1 && b1 <= 0xF3) {
511
if (b2 >= 0x80 && b2 <= 0xBF) {
512
if (source == sourceEnd)
513
return 2;
514
515
b3 = *source;
516
return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
517
}
518
return 1;
519
}
520
if (b1 == 0xF4) {
521
if (b2 >= 0x80 && b2 <= 0x8F) {
522
if (source == sourceEnd)
523
return 2;
524
525
b3 = *source;
526
return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
527
}
528
return 1;
529
}
530
531
assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
532
/*
533
* There are no valid sequences that start with these bytes. Maximal subpart
534
* is defined to have length 1 in these cases.
535
*/
536
return 1;
537
}
538
539
/* --------------------------------------------------------------------- */
540
541
/*
542
* Exported function to return the total number of bytes in a codepoint
543
* represented in UTF-8, given the value of the first byte.
544
*/
545
unsigned getNumBytesForUTF8(UTF8 first) {
546
return trailingBytesForUTF8[first] + 1;
547
}
548
549
/* --------------------------------------------------------------------- */
550
551
/*
552
* Exported function to return whether a UTF-8 string is legal or not.
553
* This is not used here; it's just exported.
554
*/
555
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
556
while (*source != sourceEnd) {
557
int length = trailingBytesForUTF8[**source] + 1;
558
if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
559
return false;
560
*source += length;
561
}
562
return true;
563
}
564
565
/* --------------------------------------------------------------------- */
566
567
/*
 * Convert UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 written to
 * [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the last sequence is cut off by sourceEnd.
 *   sourceIllegal    - an ill-formed sequence was found (checked in both
 *                      strict and lenient mode).
 *   targetExhausted  - no room for the next character; the source pointer
 *                      is backed up to the start of its sequence.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult status = conversionOK;
    const UTF8* in = *sourceStart;
    UTF16* out = *targetStart;

    for (; in < sourceEnd; ) {
        const unsigned short trailing = trailingBytesForUTF8[*in];
        const int seqLen = trailing + 1;
        UTF32 cp = 0;
        int left;

        if (trailing >= sourceEnd - in) {
            status = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(in, seqLen)) {
            status = sourceIllegal;
            break;
        }

        /* Accumulate the bytes; this is the loop form given in "Note A". */
        for (left = seqLen; left > 0; --left) {
            cp += *in++;
            if (left > 1)
                cp <<= 6;
        }
        cp -= offsetsFromUTF8[trailing];

        if (out >= targetEnd) {
            in -= seqLen; /* Back up source pointer! */
            status = targetExhausted;
            break;
        }

        if (cp > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                status = sourceIllegal;
                in -= seqLen; /* return to the start */
                break; /* Bail out; shouldn't continue */
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else if (cp > UNI_MAX_BMP) {
            /* Supplementary character: needs a surrogate pair. */
            if (out + 1 >= targetEnd) {
                in -= seqLen; /* Back up source pointer! */
                status = targetExhausted;
                break;
            }
            cp -= halfBase;
            *out++ = (UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START);
            *out++ = (UTF16)((cp & halfMask) + UNI_SUR_LOW_START);
        } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
            /* An encoded surrogate value is not a legal character. */
            if (flags == strictConversion) {
                in -= seqLen; /* return to the illegal value itself */
                status = sourceIllegal;
                break;
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else {
            *out++ = (UTF16)cp; /* normal case: character <= 0xFFFF */
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
637
638
/* --------------------------------------------------------------------- */
639
640
/*
 * Shared implementation of the UTF-8 to UTF-32 conversions.
 *
 * Converts bytes in [*sourceStart, sourceEnd) to UTF-32 written to
 * [*targetStart, targetEnd).  On return *sourceStart and *targetStart are
 * advanced past the input that was consumed and the output produced.
 *
 * InputIsPartial: when true, a sequence cut off by sourceEnd is reported as
 * sourceExhausted even in lenient mode instead of being replaced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - strict mode or partial input: the last sequence is
 *                      cut off by sourceEnd.
 *   targetExhausted  - no room left in the target buffer.
 *   sourceIllegal    - strict mode: conversion stopped at an ill-formed
 *                      sequence.  Lenient mode: at least one ill-formed
 *                      sequence was replaced with UNI_REPLACEMENT_CHAR and
 *                      the conversion continued.
 */
static ConversionResult ConvertUTF8toUTF32Impl(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
        Boolean InputIsPartial) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        if (extraBytesToRead >= sourceEnd - source) {
            if (flags == strictConversion || InputIsPartial) {
                result = sourceExhausted;
                break;
            } else {
                /*
                 * This path writes a replacement character, so it needs its
                 * own space check: the general check further down is not
                 * reached from here, and writing without it could run past
                 * the end of the target buffer.
                 */
                if (target >= targetEnd) {
                    result = targetExhausted; break;
                }
                result = sourceIllegal;

                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }

        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            if (flags == strictConversion) {
                /* Abort conversion. */
                break;
            } else {
                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6;
            case 4: ch += *source++; ch <<= 6;
            case 3: ch += *source++; ch <<= 6;
            case 2: ch += *source++; ch <<= 6;
            case 1: ch += *source++; ch <<= 6;
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
726
727
/*
 * UTF-8 to UTF-32 conversion for input that may be continued later:
 * forwards to ConvertUTF8toUTF32Impl with InputIsPartial set to true.
 */
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
                                           const UTF8 *sourceEnd,
                                           UTF32 **targetStart,
                                           UTF32 *targetEnd,
                                           ConversionFlags flags) {
  const ConversionResult status =
      ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
                             flags, /*InputIsPartial=*/true);
  return status;
}
735
736
/*
 * UTF-8 to UTF-32 conversion for complete input: forwards to
 * ConvertUTF8toUTF32Impl with InputIsPartial set to false.
 */
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
                                    const UTF8 *sourceEnd, UTF32 **targetStart,
                                    UTF32 *targetEnd, ConversionFlags flags) {
  const ConversionResult status =
      ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
                             flags, /*InputIsPartial=*/false);
  return status;
}
742
743
/* ---------------------------------------------------------------------
744
745
Note A.
746
The fall-through switches in UTF-8 reading code save a
747
temp variable, some decrements & conditionals. The switches
748
are equivalent to the following loop:
749
{
750
int tmpBytesToRead = extraBytesToRead+1;
751
do {
752
ch += *source++;
753
--tmpBytesToRead;
754
if (tmpBytesToRead) ch <<= 6;
755
} while (tmpBytesToRead > 0);
756
}
757
In UTF-8 writing code, the switches on "bytesToWrite" are
758
similarly unrolled loops.
759
760
--------------------------------------------------------------------- */
761
762
} // namespace llvm
763
764
ConvertUTF_RESTORE_WARNINGS
765
766