CoCalc -- ConvertUTF.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Support/ConvertUTF.cpp
³⁵²³² views
1
/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===------------------------------------------------------------------------=*/
8
/*
9
 * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10
 * Distributed under the Terms of Use in
11
 * http://www.unicode.org/copyright.html.
12
 *
13
 * Permission is hereby granted, free of charge, to any person obtaining
14
 * a copy of the Unicode data files and any associated documentation
15
 * (the "Data Files") or Unicode software and any associated documentation
16
 * (the "Software") to deal in the Data Files or Software
17
 * without restriction, including without limitation the rights to use,
18
 * copy, modify, merge, publish, distribute, and/or sell copies of
19
 * the Data Files or Software, and to permit persons to whom the Data Files
20
 * or Software are furnished to do so, provided that
21
 * (a) this copyright and permission notice appear with all copies
22
 * of the Data Files or Software,
23
 * (b) this copyright and permission notice appear in associated
24
 * documentation, and
25
 * (c) there is clear notice in each modified Data File or in the Software
26
 * as well as in the documentation associated with the Data File(s) or
27
 * Software that the data or software has been modified.
28
 *
29
 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30
 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31
 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32
 * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34
 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35
 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38
 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
39
 *
40
 * Except as contained in this notice, the name of a copyright holder
41
 * shall not be used in advertising or otherwise to promote the sale,
42
 * use or other dealings in these Data Files or Software without prior
43
 * written authorization of the copyright holder.
44
 */
45

46
/* ---------------------------------------------------------------------
47

48
    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
49
    Author: Mark E. Davis, 1994.
50
    Rev History: Rick McGowan, fixes & updates May 2001.
51
    Sept 2001: fixed const & error conditions per
52
        mods suggested by S. Parent & A. Lillich.
53
    June 2002: Tim Dodd added detection and handling of incomplete
54
        source sequences, enhanced error detection, added casts
55
        to eliminate compiler warnings.
56
    July 2003: slight mods to back out aggressive FFFE detection.
57
    Jan 2004: updated switches in from-UTF8 conversions.
58
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
59

60
    See the header file "ConvertUTF.h" for complete documentation.
61

62
------------------------------------------------------------------------ */
63

64
#include "llvm/Support/ConvertUTF.h"
65
#ifdef CVTUTF_DEBUG
66
#include <stdio.h>
67
#endif
68
#include <assert.h>
69

70
/*
 * The converters in this file rely on deliberate switch fall-through.
 * ConvertUTF_DISABLE_WARNINGS pushes the diagnostic state and silences
 * -Wimplicit-fallthrough on compilers known to support that;
 * ConvertUTF_RESTORE_WARNINGS pops the state again.
 */
#if defined(__clang__) && defined(__has_warning)
#if __has_warning("-Wimplicit-fallthrough")
#define ConvertUTF_DISABLE_WARNINGS                                            \
  _Pragma("clang diagnostic push")                                             \
  _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS _Pragma("clang diagnostic pop")
#endif
#elif defined(__GNUC__) && __GNUC__ > 6
#define ConvertUTF_DISABLE_WARNINGS                                            \
  _Pragma("GCC diagnostic push")                                               \
  _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS _Pragma("GCC diagnostic pop")
#endif

/* Any compiler not matched above gets empty (no-op) definitions. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#define ConvertUTF_RESTORE_WARNINGS
#endif

ConvertUTF_DISABLE_WARNINGS
97

98
namespace llvm {
99

100
static const int halfShift  = 10; /* used for shifting by 10 bits */
101

102
static const UTF32 halfBase = 0x0010000UL;
103
static const UTF32 halfMask = 0x3FFUL;
104

105
#define UNI_SUR_HIGH_START  (UTF32)0xD800
106
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
107
#define UNI_SUR_LOW_START   (UTF32)0xDC00
108
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
109

110
/* --------------------------------------------------------------------- */
111

112
/*
 * trailingBytesForUTF8[b] is the number of bytes expected to follow a UTF-8
 * sequence whose first byte is b.  One row per 16 byte values; the comment
 * on each row gives the value of its first entry.
 *
 * The entries 4 and 5 (lead bytes 0xF8..0xFF) describe 5- and 6-byte forms
 * that are not legal UTF-8; they are kept for callers that want to handle
 * the older, longer encodings.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
129

130
/*
131
 * Magic values subtracted from a buffer value during UTF8 conversion.
132
 * This table contains as many values as there might be trailing bytes
133
 * in a UTF-8 sequence.
134
 */
135
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
136
                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
137

138
/*
139
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
140
 * into the first byte, depending on how many bytes follow.  There are
141
 * as many entries in this table as there are UTF-8 sequence types.
142
 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
143
 * for *legal* UTF-8 will be 4 or fewer bytes total.
144
 */
145
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
146

147
/* --------------------------------------------------------------------- */
148

149
/* The interface converts a whole buffer to avoid function-call overhead.
150
 * Constants have been gathered. Loops & conditionals have been removed as
151
 * much as possible for efficiency, in favor of drop-through switches.
152
 * (See "Note A" at the bottom of the file for equivalent code.)
153
 * If your compiler supports it, the "isLegalUTF8" call can be turned
154
 * into an inline function.
155
 */
156

157

158
/* --------------------------------------------------------------------- */
159

160
/*
 * Converts the UTF-32 units in [*sourceStart, sourceEnd) to UTF-16, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - all input was converted;
 *   targetExhausted  - the output buffer filled up; *sourceStart points at
 *                      the first unit that was not converted;
 *   sourceIllegal    - strictConversion only: a surrogate value or a value
 *                      above UNI_MAX_LEGAL_UTF32 was found; *sourceStart
 *                      points at that unit.
 * Without strictConversion such values are replaced by UNI_REPLACEMENT_CHAR.
 */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }
        ch = *source++;
        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
            /* UTF-16 surrogate values are illegal in UTF-32. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop at the offending unit, exactly as for surrogates
                 * above.  Continuing would drop the unit silently and let a
                 * later targetExhausted overwrite the error.
                 */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* ch is in 0x10000..0x10FFFF and needs a surrogate pair. */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted; break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
206

207
/* --------------------------------------------------------------------- */
208

209
/*
 * Converts the UTF-16 units in [*sourceStart, sourceEnd) to UTF-32, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - all input was converted;
 *   sourceExhausted  - the input ends with a high surrogate; *sourceStart
 *                      points at it;
 *   targetExhausted  - the output buffer filled up; *sourceStart points at
 *                      the start of the character that did not fit;
 *   sourceIllegal    - strictConversion only: an unpaired surrogate was
 *                      found; *sourceStart points at it.
 * Without strictConversion unpaired surrogates are copied through unchanged.
 */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /*
     * Both are initialized because the CVTUTF_DEBUG report below reads them
     * on paths where the loop has not assigned ch2 (lone low surrogate).
     */
    UTF32 ch = 0, ch2 = 0;
    while (source < sourceEnd) {
        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
        ch = *source++;
        ch2 = 0; /* do not carry a value over from an earlier iteration */
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
if (result == sourceIllegal) {
    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    fflush(stderr);
}
#endif
    return result;
}
263
/*
 * Converts the UTF-16 units in [*sourceStart, sourceEnd) to UTF-8, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns conversionOK, sourceExhausted (input
 * ends with a high surrogate), targetExhausted (the next character does not
 * fit) or, with strictConversion only, sourceIllegal (unpaired surrogate).
 * In every non-OK case *sourceStart points at the unit that caused the stop.
 */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    const UTF32 continuationMask = 0xBF;
    const UTF32 continuationMark = 0x80;
    const UTF16* in = *sourceStart;
    UTF8* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        const UTF16* unitStart = in; /* restart point if the output is full */
        UTF32 cp = *in++;
        unsigned short width = 0;
        UTF8* tail;

        if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_HIGH_END) {
            UTF32 next;
            if (in >= sourceEnd) {
                /* The unit after the high surrogate is not available. */
                --in;
                status = sourceExhausted;
                break;
            }
            next = *in;
            if (next >= UNI_SUR_LOW_START && next <= UNI_SUR_LOW_END) {
                /* Combine the pair into one code point. */
                cp = ((cp - UNI_SUR_HIGH_START) << halfShift)
                    + (next - UNI_SUR_LOW_START) + halfBase;
                ++in;
            } else if (flags == strictConversion) {
                /* Unpaired high surrogate. */
                --in;
                status = sourceIllegal;
                break;
            }
        } else if (flags == strictConversion &&
                   cp >= UNI_SUR_LOW_START && cp <= UNI_SUR_LOW_END) {
            /* Unpaired low surrogate. */
            --in;
            status = sourceIllegal;
            break;
        }

        /* Pick the encoded length for this code point. */
        if (cp < (UTF32)0x80) {
            width = 1;
        } else if (cp < (UTF32)0x800) {
            width = 2;
        } else if (cp < (UTF32)0x10000) {
            width = 3;
        } else if (cp < (UTF32)0x110000) {
            width = 4;
        } else {
            width = 3;
            cp = UNI_REPLACEMENT_CHAR;
        }

        tail = out + width;
        if (tail > targetEnd) {
            in = unitStart;
            status = targetExhausted;
            break;
        }

        /* Fill the sequence back to front: continuation bytes, then lead. */
        for (unsigned short left = width; left > 1; --left) {
            *--tail = (UTF8)((cp | continuationMark) & continuationMask);
            cp >>= 6;
        }
        *--tail = (UTF8)(cp | firstByteMark[width]);
        out += width;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
330

331
/* --------------------------------------------------------------------- */
332

333
/*
 * Converts the UTF-32 units in [*sourceStart, sourceEnd) to UTF-8, writing
 * into [*targetStart, targetEnd).
 *
 * With strictConversion a surrogate value stops the conversion with
 * sourceIllegal and *sourceStart pointing at it.  A value above
 * UNI_MAX_LEGAL_UTF32 is, in either mode, encoded as UNI_REPLACEMENT_CHAR
 * and recorded as sourceIllegal while conversion continues.  If the next
 * character does not fit, targetExhausted is returned with *sourceStart
 * pointing at that character.
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    const UTF32 continuationMask = 0xBF;
    const UTF32 continuationMark = 0x80;
    const UTF32* in = *sourceStart;
    UTF8* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        UTF32 cp = *in++;
        unsigned short width = 0;
        UTF8* tail;

        if (flags == strictConversion &&
            cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
            /* UTF-16 surrogate values are illegal in UTF-32. */
            --in;
            status = sourceIllegal;
            break;
        }

        /* Pick the encoded length; oversized values become U+FFFD. */
        if (cp < (UTF32)0x80) {
            width = 1;
        } else if (cp < (UTF32)0x800) {
            width = 2;
        } else if (cp < (UTF32)0x10000) {
            width = 3;
        } else if (cp <= UNI_MAX_LEGAL_UTF32) {
            width = 4;
        } else {
            width = 3;
            cp = UNI_REPLACEMENT_CHAR;
            status = sourceIllegal;
        }

        tail = out + width;
        if (tail > targetEnd) {
            --in; /* leave the unconverted unit for the caller */
            status = targetExhausted;
            break;
        }

        /* Fill the sequence back to front: continuation bytes, then lead. */
        for (unsigned short left = width; left > 1; --left) {
            *--tail = (UTF8)((cp | continuationMark) & continuationMask);
            cp >>= 6;
        }
        *--tail = (UTF8)(cp | firstByteMark[width]);
        out += width;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
383

384
/* --------------------------------------------------------------------- */
385

386
/*
387
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
388
 * This must be called with the length pre-determined by the first byte.
389
 * If not calling this from ConvertUTF8to*, then the length can be set by:
390
 *  length = trailingBytesForUTF8[*source]+1;
391
 * and the sequence is illegal right away if there aren't that many bytes
392
 * available.
393
 * If presented with a length > 4, this returns false.  The Unicode
394
 * definition of UTF-8 goes up to 4-byte sequences.
395
 */
396

397
static Boolean isLegalUTF8(const UTF8 *source, int length) {
398
    UTF8 a;
399
    const UTF8 *srcptr = source+length;
400
    switch (length) {
401
    default: return false;
402
        /* Everything else falls through when "true"... */
403
    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
404
    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
405
    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
406

407
        switch (*source) {
408
            /* no fall-through in this inner switch */
409
            case 0xE0: if (a < 0xA0) return false; break;
410
            case 0xED: if (a > 0x9F) return false; break;
411
            case 0xF0: if (a < 0x90) return false; break;
412
            case 0xF4: if (a > 0x8F) return false; break;
413
            default:   if (a < 0x80) return false;
414
        }
415

416
    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
417
    }
418
    if (*source > 0xF4) return false;
419
    return true;
420
}
421

422
/* --------------------------------------------------------------------- */
423

424
/*
425
 * Exported function to return whether a UTF-8 sequence is legal or not.
426
 * This is not used here; it's just exported.
427
 */
428
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
429
    int length = trailingBytesForUTF8[*source]+1;
430
    if (length > sourceEnd - source) {
431
        return false;
432
    }
433
    return isLegalUTF8(source, length);
434
}
435

436
/*
437
 * Exported function to return the size of the first utf-8 code unit sequence,
438
 * Or 0 if the sequence is not valid;
439
 */
440
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
441
  int length = trailingBytesForUTF8[*source] + 1;
442
  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443
                                                                       : 0;
444
}
445

446
/* --------------------------------------------------------------------- */
447

448
static unsigned
449
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
450
                                          const UTF8 *sourceEnd) {
451
  UTF8 b1, b2, b3;
452

453
  assert(!isLegalUTF8Sequence(source, sourceEnd));
454

455
  /*
456
   * Unicode 6.3.0, D93b:
457
   *
458
   *   Maximal subpart of an ill-formed subsequence: The longest code unit
459
   *   subsequence starting at an unconvertible offset that is either:
460
   *   a. the initial subsequence of a well-formed code unit sequence, or
461
   *   b. a subsequence of length one.
462
   */
463

464
  if (source == sourceEnd)
465
    return 0;
466

467
  /*
468
   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
469
   * Byte Sequences.
470
   */
471

472
  b1 = *source;
473
  ++source;
474
  if (b1 >= 0xC2 && b1 <= 0xDF) {
475
    /*
476
     * First byte is valid, but we know that this code unit sequence is
477
     * invalid, so the maximal subpart has to end after the first byte.
478
     */
479
    return 1;
480
  }
481

482
  if (source == sourceEnd)
483
    return 1;
484

485
  b2 = *source;
486
  ++source;
487

488
  if (b1 == 0xE0) {
489
    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
490
  }
491
  if (b1 >= 0xE1 && b1 <= 0xEC) {
492
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
493
  }
494
  if (b1 == 0xED) {
495
    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
496
  }
497
  if (b1 >= 0xEE && b1 <= 0xEF) {
498
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
499
  }
500
  if (b1 == 0xF0) {
501
    if (b2 >= 0x90 && b2 <= 0xBF) {
502
      if (source == sourceEnd)
503
        return 2;
504

505
      b3 = *source;
506
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
507
    }
508
    return 1;
509
  }
510
  if (b1 >= 0xF1 && b1 <= 0xF3) {
511
    if (b2 >= 0x80 && b2 <= 0xBF) {
512
      if (source == sourceEnd)
513
        return 2;
514

515
      b3 = *source;
516
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
517
    }
518
    return 1;
519
  }
520
  if (b1 == 0xF4) {
521
    if (b2 >= 0x80 && b2 <= 0x8F) {
522
      if (source == sourceEnd)
523
        return 2;
524

525
      b3 = *source;
526
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
527
    }
528
    return 1;
529
  }
530

531
  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
532
  /*
533
   * There are no valid sequences that start with these bytes.  Maximal subpart
534
   * is defined to have length 1 in these cases.
535
   */
536
  return 1;
537
}
538

539
/* --------------------------------------------------------------------- */
540

541
/*
542
 * Exported function to return the total number of bytes in a codepoint
543
 * represented in UTF-8, given the value of the first byte.
544
 */
545
unsigned getNumBytesForUTF8(UTF8 first) {
546
  return trailingBytesForUTF8[first] + 1;
547
}
548

549
/* --------------------------------------------------------------------- */
550

551
/*
552
 * Exported function to return whether a UTF-8 string is legal or not.
553
 * This is not used here; it's just exported.
554
 */
555
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
556
    while (*source != sourceEnd) {
557
        int length = trailingBytesForUTF8[**source] + 1;
558
        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
559
            return false;
560
        *source += length;
561
    }
562
    return true;
563
}
564

565
/* --------------------------------------------------------------------- */
566

567
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns conversionOK, sourceExhausted (the
 * last sequence is cut off), sourceIllegal (isLegalUTF8 rejected a sequence,
 * in either mode) or targetExhausted.  In every non-OK case *sourceStart
 * points at the first byte of the sequence that caused the stop.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    const UTF8* in = *sourceStart;
    UTF16* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        const UTF8* seqStart = in;
        const unsigned short trailing = trailingBytesForUTF8[*in];
        UTF32 cp = 0;

        if (trailing >= sourceEnd - in) {
            status = sourceExhausted;
            break;
        }
        /* Well-formedness is checked whether lenient or strict. */
        if (!isLegalUTF8(in, trailing+1)) {
            status = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes, then strip the marker bits in one go. */
        for (unsigned short i = 0; i < trailing; ++i) {
            cp += *in++;
            cp <<= 6;
        }
        cp += *in++;
        cp -= offsetsFromUTF8[trailing];

        if (out >= targetEnd) {
            in = seqStart; /* Back up source pointer! */
            status = targetExhausted;
            break;
        }

        if (cp <= UNI_MAX_BMP) {
            /* Fits in one UTF-16 unit unless it is a surrogate value. */
            if (cp < UNI_SUR_HIGH_START || cp > UNI_SUR_LOW_END) {
                *out++ = (UTF16)cp;
            } else if (flags == strictConversion) {
                in = seqStart; /* return to the illegal value itself */
                status = sourceIllegal;
                break;
            } else {
                *out++ = UNI_REPLACEMENT_CHAR;
            }
        } else if (cp > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                status = sourceIllegal;
                in = seqStart; /* return to the start */
                break;
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Needs a surrogate pair, i.e. two free output units. */
            if (out + 1 >= targetEnd) {
                in = seqStart; /* Back up source pointer! */
                status = targetExhausted;
                break;
            }
            cp -= halfBase;
            *out++ = (UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START);
            *out++ = (UTF16)((cp & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
637

638
/* --------------------------------------------------------------------- */
639

640
/*
 * Shared implementation of ConvertUTF8toUTF32 and ConvertUTF8toUTF32Partial.
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32, writing
 * into [*targetStart, targetEnd).
 *
 * flags          - with strictConversion the first ill-formed sequence or
 *                  encoded surrogate stops the conversion (sourceIllegal);
 *                  otherwise each maximal ill-formed subpart is replaced by
 *                  one UNI_REPLACEMENT_CHAR, conversion continues, and
 *                  sourceIllegal is still reported as the result.
 * InputIsPartial - when true, or when strict, a sequence cut off by
 *                  sourceEnd stops the conversion with sourceExhausted
 *                  instead of being replaced.
 *
 * targetExhausted is returned when the output buffer is full.  On return
 * *sourceStart and *targetStart are advanced past everything consumed and
 * produced.
 */
static ConversionResult ConvertUTF8toUTF32Impl(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
        Boolean InputIsPartial) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        if (extraBytesToRead >= sourceEnd - source) {
            if (flags == strictConversion || InputIsPartial) {
                result = sourceExhausted;
                break;
            } else {
                /*
                 * This path writes to the target, so it needs room there
                 * just like the normal path below.
                 */
                if (target >= targetEnd) {
                    result = targetExhausted;
                    break;
                }
                result = sourceIllegal;

                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }

        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            if (flags == strictConversion) {
                /* Abort conversion. */
                break;
            } else {
                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        /*
         * The cases all fall through: each one adds a byte and shifts, so
         * entering at case N reads N+1 bytes in total.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6;
            case 4: ch += *source++; ch <<= 6;
            case 3: ch += *source++; ch <<= 6;
            case 2: ch += *source++; ch <<= 6;
            case 1: ch += *source++; ch <<= 6;
            case 0: ch += *source++;
        }
        /* Remove the lead/continuation marker bits accumulated above. */
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
726

727
/*
 * UTF-8 to UTF-32 conversion for input that may continue in a later
 * buffer: forwards to ConvertUTF8toUTF32Impl with InputIsPartial set.
 */
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
                                           const UTF8 *sourceEnd,
                                           UTF32 **targetStart,
                                           UTF32 *targetEnd,
                                           ConversionFlags flags) {
  const Boolean InputIsPartial = true;
  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
                                flags, InputIsPartial);
}
735

736
/*
 * UTF-8 to UTF-32 conversion for complete input: forwards to
 * ConvertUTF8toUTF32Impl with InputIsPartial cleared.
 */
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
                                    const UTF8 *sourceEnd, UTF32 **targetStart,
                                    UTF32 *targetEnd, ConversionFlags flags) {
  const Boolean InputIsPartial = false;
  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
                                flags, InputIsPartial);
}
742

743
/* ---------------------------------------------------------------------
744

745
    Note A.
746
    The fall-through switches in UTF-8 reading code save a
747
    temp variable, some decrements & conditionals.  The switches
748
    are equivalent to the following loop:
749
        {
750
            int tmpBytesToRead = extraBytesToRead+1;
751
            do {
752
                ch += *source++;
753
                --tmpBytesToRead;
754
                if (tmpBytesToRead) ch <<= 6;
755
            } while (tmpBytesToRead > 0);
756
        }
757
    In UTF-8 writing code, the switches on "bytesToWrite" are
758
    similarly unrolled loops.
759

760
   --------------------------------------------------------------------- */
761

762
} // namespace llvm
763

764
ConvertUTF_RESTORE_WARNINGS
765

766
Product

Resources

Company