Path: blob/main/contrib/llvm-project/llvm/lib/Support/ConvertUTF.cpp
35232 views
/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===------------------------------------------------------------------------=*/7/*8* Copyright © 1991-2015 Unicode, Inc. All rights reserved.9* Distributed under the Terms of Use in10* http://www.unicode.org/copyright.html.11*12* Permission is hereby granted, free of charge, to any person obtaining13* a copy of the Unicode data files and any associated documentation14* (the "Data Files") or Unicode software and any associated documentation15* (the "Software") to deal in the Data Files or Software16* without restriction, including without limitation the rights to use,17* copy, modify, merge, publish, distribute, and/or sell copies of18* the Data Files or Software, and to permit persons to whom the Data Files19* or Software are furnished to do so, provided that20* (a) this copyright and permission notice appear with all copies21* of the Data Files or Software,22* (b) this copyright and permission notice appear in associated23* documentation, and24* (c) there is clear notice in each modified Data File or in the Software25* as well as in the documentation associated with the Data File(s) or26* Software that the data or software has been modified.27*28* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF29* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE30* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND31* NONINFRINGEMENT OF THIRD PARTY RIGHTS.32* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS33* NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL34* DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,35* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER36* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR37* PERFORMANCE OF THE DATA FILES OR SOFTWARE.38*39* Except as contained in this notice, the name of a copyright holder40* shall not be used in advertising or otherwise to promote the sale,41* use or other dealings in these Data Files or Software without prior42* written authorization of the copyright holder.43*/4445/* ---------------------------------------------------------------------4647Conversions between UTF32, UTF-16, and UTF-8. Source code file.48Author: Mark E. Davis, 1994.49Rev History: Rick McGowan, fixes & updates May 2001.50Sept 2001: fixed const & error conditions per51mods suggested by S. Parent & A. Lillich.52June 2002: Tim Dodd added detection and handling of incomplete53source sequences, enhanced error detection, added casts54to eliminate compiler warnings.55July 2003: slight mods to back out aggressive FFFE detection.56Jan 2004: updated switches in from-UTF8 conversions.57Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.5859See the header file "ConvertUTF.h" for complete documentation.6061------------------------------------------------------------------------ */6263#include "llvm/Support/ConvertUTF.h"64#ifdef CVTUTF_DEBUG65#include <stdio.h>66#endif67#include <assert.h>6869/*70* This code extensively uses fall-through switches.71* Keep the compiler from warning about that.72*/73#if defined(__clang__) && defined(__has_warning)74# if __has_warning("-Wimplicit-fallthrough")75# define ConvertUTF_DISABLE_WARNINGS \76_Pragma("clang diagnostic push") \77_Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")78# define ConvertUTF_RESTORE_WARNINGS \79_Pragma("clang diagnostic pop")80# endif81#elif defined(__GNUC__) && __GNUC__ > 682# define ConvertUTF_DISABLE_WARNINGS \83_Pragma("GCC diagnostic push") \84_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")85# define ConvertUTF_RESTORE_WARNINGS \86_Pragma("GCC diagnostic pop")87#endif88#ifndef ConvertUTF_DISABLE_WARNINGS89# define ConvertUTF_DISABLE_WARNINGS90#endif91#ifndef ConvertUTF_RESTORE_WARNINGS92# define ConvertUTF_RESTORE_WARNINGS93#endif9495ConvertUTF_DISABLE_WARNINGS9697namespace llvm {9899static const int halfShift = 10; /* used for shifting by 10 bits */100101static const UTF32 halfBase = 0x0010000UL;102static const UTF32 halfMask = 0x3FFUL;103104#define UNI_SUR_HIGH_START (UTF32)0xD800105#define UNI_SUR_HIGH_END (UTF32)0xDBFF106#define UNI_SUR_LOW_START (UTF32)0xDC00107#define UNI_SUR_LOW_END (UTF32)0xDFFF108109/* --------------------------------------------------------------------- */110111/*112* Index into the table below with the first byte of a UTF-8 sequence to113* get the number of trailing bytes that are supposed to follow it.114* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is115* left as-is for anyone who may want to do such conversion, which was116* allowed in earlier algorithms.117*/118static const char trailingBytesForUTF8[256] = {1190,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1210,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1251,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1262,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5127};128129/*130* Magic values subtracted from a buffer value during UTF8 conversion.131* This table contains as many values as there might be trailing bytes132* in a UTF-8 sequence.133*/134static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,1350x03C82080UL, 0xFA082080UL, 0x82082080UL };136137/*138* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed139* into the first byte, depending on how many bytes follow. There are140* as many entries in this table as there are UTF-8 sequence types.141* (I.e., one byte sequence, two byte... etc.). Remember that sequencs142* for *legal* UTF-8 will be 4 or fewer bytes total.143*/144static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };145146/* --------------------------------------------------------------------- */147148/* The interface converts a whole buffer to avoid function-call overhead.149* Constants have been gathered. Loops & conditionals have been removed as150* much as possible for efficiency, in favor of drop-through switches.151* (See "Note A" at the bottom of the file for equivalent code.)152* If your compiler supports it, the "isLegalUTF8" call can be turned153* into an inline function.154*/155156157/* --------------------------------------------------------------------- */158159ConversionResult ConvertUTF32toUTF16 (160const UTF32** sourceStart, const UTF32* sourceEnd,161UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {162ConversionResult result = conversionOK;163const UTF32* source = *sourceStart;164UTF16* target = *targetStart;165while (source < sourceEnd) {166UTF32 ch;167if (target >= targetEnd) {168result = targetExhausted; break;169}170ch = *source++;171if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */172/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */173if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {174if (flags == strictConversion) {175--source; /* return to the illegal value itself */176result = sourceIllegal;177break;178} else {179*target++ = UNI_REPLACEMENT_CHAR;180}181} else {182*target++ = (UTF16)ch; /* normal case */183}184} else if (ch > UNI_MAX_LEGAL_UTF32) {185if (flags == strictConversion) {186result = sourceIllegal;187} else {188*target++ = UNI_REPLACEMENT_CHAR;189}190} else {191/* target is a character in range 0xFFFF - 0x10FFFF. */192if (target + 1 >= targetEnd) {193--source; /* Back up source pointer! */194result = targetExhausted; break;195}196ch -= halfBase;197*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);198*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);199}200}201*sourceStart = source;202*targetStart = target;203return result;204}205206/* --------------------------------------------------------------------- */207208ConversionResult ConvertUTF16toUTF32 (209const UTF16** sourceStart, const UTF16* sourceEnd,210UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {211ConversionResult result = conversionOK;212const UTF16* source = *sourceStart;213UTF32* target = *targetStart;214UTF32 ch, ch2;215while (source < sourceEnd) {216const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */217ch = *source++;218/* If we have a surrogate pair, convert to UTF32 first. */219if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {220/* If the 16 bits following the high surrogate are in the source buffer... */221if (source < sourceEnd) {222ch2 = *source;223/* If it's a low surrogate, convert to UTF32. */224if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {225ch = ((ch - UNI_SUR_HIGH_START) << halfShift)226+ (ch2 - UNI_SUR_LOW_START) + halfBase;227++source;228} else if (flags == strictConversion) { /* it's an unpaired high surrogate */229--source; /* return to the illegal value itself */230result = sourceIllegal;231break;232}233} else { /* We don't have the 16 bits following the high surrogate. */234--source; /* return to the high surrogate */235result = sourceExhausted;236break;237}238} else if (flags == strictConversion) {239/* UTF-16 surrogate values are illegal in UTF-32 */240if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {241--source; /* return to the illegal value itself */242result = sourceIllegal;243break;244}245}246if (target >= targetEnd) {247source = oldSource; /* Back up source pointer! */248result = targetExhausted; break;249}250*target++ = ch;251}252*sourceStart = source;253*targetStart = target;254#ifdef CVTUTF_DEBUG255if (result == sourceIllegal) {256fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);257fflush(stderr);258}259#endif260return result;261}262ConversionResult ConvertUTF16toUTF8 (263const UTF16** sourceStart, const UTF16* sourceEnd,264UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {265ConversionResult result = conversionOK;266const UTF16* source = *sourceStart;267UTF8* target = *targetStart;268while (source < sourceEnd) {269UTF32 ch;270unsigned short bytesToWrite = 0;271const UTF32 byteMask = 0xBF;272const UTF32 byteMark = 0x80;273const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */274ch = *source++;275/* If we have a surrogate pair, convert to UTF32 first. */276if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {277/* If the 16 bits following the high surrogate are in the source buffer... */278if (source < sourceEnd) {279UTF32 ch2 = *source;280/* If it's a low surrogate, convert to UTF32. */281if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {282ch = ((ch - UNI_SUR_HIGH_START) << halfShift)283+ (ch2 - UNI_SUR_LOW_START) + halfBase;284++source;285} else if (flags == strictConversion) { /* it's an unpaired high surrogate */286--source; /* return to the illegal value itself */287result = sourceIllegal;288break;289}290} else { /* We don't have the 16 bits following the high surrogate. */291--source; /* return to the high surrogate */292result = sourceExhausted;293break;294}295} else if (flags == strictConversion) {296/* UTF-16 surrogate values are illegal in UTF-32 */297if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {298--source; /* return to the illegal value itself */299result = sourceIllegal;300break;301}302}303/* Figure out how many bytes the result will require */304if (ch < (UTF32)0x80) { bytesToWrite = 1;305} else if (ch < (UTF32)0x800) { bytesToWrite = 2;306} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;307} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;308} else { bytesToWrite = 3;309ch = UNI_REPLACEMENT_CHAR;310}311312target += bytesToWrite;313if (target > targetEnd) {314source = oldSource; /* Back up source pointer! */315target -= bytesToWrite; result = targetExhausted; break;316}317switch (bytesToWrite) { /* note: everything falls through. */318case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;319case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;320case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;321case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);322}323target += bytesToWrite;324}325*sourceStart = source;326*targetStart = target;327return result;328}329330/* --------------------------------------------------------------------- */331332ConversionResult ConvertUTF32toUTF8 (333const UTF32** sourceStart, const UTF32* sourceEnd,334UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {335ConversionResult result = conversionOK;336const UTF32* source = *sourceStart;337UTF8* target = *targetStart;338while (source < sourceEnd) {339UTF32 ch;340unsigned short bytesToWrite = 0;341const UTF32 byteMask = 0xBF;342const UTF32 byteMark = 0x80;343ch = *source++;344if (flags == strictConversion ) {345/* UTF-16 surrogate values are illegal in UTF-32 */346if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {347--source; /* return to the illegal value itself */348result = sourceIllegal;349break;350}351}352/*353* Figure out how many bytes the result will require. Turn any354* illegally large UTF32 things (> Plane 17) into replacement chars.355*/356if (ch < (UTF32)0x80) { bytesToWrite = 1;357} else if (ch < (UTF32)0x800) { bytesToWrite = 2;358} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;359} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;360} else { bytesToWrite = 3;361ch = UNI_REPLACEMENT_CHAR;362result = sourceIllegal;363}364365target += bytesToWrite;366if (target > targetEnd) {367--source; /* Back up source pointer! */368target -= bytesToWrite; result = targetExhausted; break;369}370switch (bytesToWrite) { /* note: everything falls through. */371case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;372case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;373case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;374case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);375}376target += bytesToWrite;377}378*sourceStart = source;379*targetStart = target;380return result;381}382383/* --------------------------------------------------------------------- */384385/*386* Utility routine to tell whether a sequence of bytes is legal UTF-8.387* This must be called with the length pre-determined by the first byte.388* If not calling this from ConvertUTF8to*, then the length can be set by:389* length = trailingBytesForUTF8[*source]+1;390* and the sequence is illegal right away if there aren't that many bytes391* available.392* If presented with a length > 4, this returns false. The Unicode393* definition of UTF-8 goes up to 4-byte sequences.394*/395396static Boolean isLegalUTF8(const UTF8 *source, int length) {397UTF8 a;398const UTF8 *srcptr = source+length;399switch (length) {400default: return false;401/* Everything else falls through when "true"... */402case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;403case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;404case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;405406switch (*source) {407/* no fall-through in this inner switch */408case 0xE0: if (a < 0xA0) return false; break;409case 0xED: if (a > 0x9F) return false; break;410case 0xF0: if (a < 0x90) return false; break;411case 0xF4: if (a > 0x8F) return false; break;412default: if (a < 0x80) return false;413}414415case 1: if (*source >= 0x80 && *source < 0xC2) return false;416}417if (*source > 0xF4) return false;418return true;419}420421/* --------------------------------------------------------------------- */422423/*424* Exported function to return whether a UTF-8 sequence is legal or not.425* This is not used here; it's just exported.426*/427Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {428int length = trailingBytesForUTF8[*source]+1;429if (length > sourceEnd - source) {430return false;431}432return isLegalUTF8(source, length);433}434435/*436* Exported function to return the size of the first utf-8 code unit sequence,437* Or 0 if the sequence is not valid;438*/439unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {440int length = trailingBytesForUTF8[*source] + 1;441return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length442: 0;443}444445/* --------------------------------------------------------------------- */446447static unsigned448findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,449const UTF8 *sourceEnd) {450UTF8 b1, b2, b3;451452assert(!isLegalUTF8Sequence(source, sourceEnd));453454/*455* Unicode 6.3.0, D93b:456*457* Maximal subpart of an ill-formed subsequence: The longest code unit458* subsequence starting at an unconvertible offset that is either:459* a. the initial subsequence of a well-formed code unit sequence, or460* b. a subsequence of length one.461*/462463if (source == sourceEnd)464return 0;465466/*467* Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8468* Byte Sequences.469*/470471b1 = *source;472++source;473if (b1 >= 0xC2 && b1 <= 0xDF) {474/*475* First byte is valid, but we know that this code unit sequence is476* invalid, so the maximal subpart has to end after the first byte.477*/478return 1;479}480481if (source == sourceEnd)482return 1;483484b2 = *source;485++source;486487if (b1 == 0xE0) {488return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;489}490if (b1 >= 0xE1 && b1 <= 0xEC) {491return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;492}493if (b1 == 0xED) {494return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;495}496if (b1 >= 0xEE && b1 <= 0xEF) {497return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;498}499if (b1 == 0xF0) {500if (b2 >= 0x90 && b2 <= 0xBF) {501if (source == sourceEnd)502return 2;503504b3 = *source;505return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;506}507return 1;508}509if (b1 >= 0xF1 && b1 <= 0xF3) {510if (b2 >= 0x80 && b2 <= 0xBF) {511if (source == sourceEnd)512return 2;513514b3 = *source;515return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;516}517return 1;518}519if (b1 == 0xF4) {520if (b2 >= 0x80 && b2 <= 0x8F) {521if (source == sourceEnd)522return 2;523524b3 = *source;525return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;526}527return 1;528}529530assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);531/*532* There are no valid sequences that start with these bytes. Maximal subpart533* is defined to have length 1 in these cases.534*/535return 1;536}537538/* --------------------------------------------------------------------- */539540/*541* Exported function to return the total number of bytes in a codepoint542* represented in UTF-8, given the value of the first byte.543*/544unsigned getNumBytesForUTF8(UTF8 first) {545return trailingBytesForUTF8[first] + 1;546}547548/* --------------------------------------------------------------------- */549550/*551* Exported function to return whether a UTF-8 string is legal or not.552* This is not used here; it's just exported.553*/554Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {555while (*source != sourceEnd) {556int length = trailingBytesForUTF8[**source] + 1;557if (length > sourceEnd - *source || !isLegalUTF8(*source, length))558return false;559*source += length;560}561return true;562}563564/* --------------------------------------------------------------------- */565566ConversionResult ConvertUTF8toUTF16 (567const UTF8** sourceStart, const UTF8* sourceEnd,568UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {569ConversionResult result = conversionOK;570const UTF8* source = *sourceStart;571UTF16* target = *targetStart;572while (source < sourceEnd) {573UTF32 ch = 0;574unsigned short extraBytesToRead = trailingBytesForUTF8[*source];575if (extraBytesToRead >= sourceEnd - source) {576result = sourceExhausted; break;577}578/* Do this check whether lenient or strict */579if (!isLegalUTF8(source, extraBytesToRead+1)) {580result = sourceIllegal;581break;582}583/*584* The cases all fall through. See "Note A" below.585*/586switch (extraBytesToRead) {587case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */588case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */589case 3: ch += *source++; ch <<= 6;590case 2: ch += *source++; ch <<= 6;591case 1: ch += *source++; ch <<= 6;592case 0: ch += *source++;593}594ch -= offsetsFromUTF8[extraBytesToRead];595596if (target >= targetEnd) {597source -= (extraBytesToRead+1); /* Back up source pointer! */598result = targetExhausted; break;599}600if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */601/* UTF-16 surrogate values are illegal in UTF-32 */602if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {603if (flags == strictConversion) {604source -= (extraBytesToRead+1); /* return to the illegal value itself */605result = sourceIllegal;606break;607} else {608*target++ = UNI_REPLACEMENT_CHAR;609}610} else {611*target++ = (UTF16)ch; /* normal case */612}613} else if (ch > UNI_MAX_UTF16) {614if (flags == strictConversion) {615result = sourceIllegal;616source -= (extraBytesToRead+1); /* return to the start */617break; /* Bail out; shouldn't continue */618} else {619*target++ = UNI_REPLACEMENT_CHAR;620}621} else {622/* target is a character in range 0xFFFF - 0x10FFFF. */623if (target + 1 >= targetEnd) {624source -= (extraBytesToRead+1); /* Back up source pointer! */625result = targetExhausted; break;626}627ch -= halfBase;628*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);629*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);630}631}632*sourceStart = source;633*targetStart = target;634return result;635}636637/* --------------------------------------------------------------------- */638639static ConversionResult ConvertUTF8toUTF32Impl(640const UTF8** sourceStart, const UTF8* sourceEnd,641UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,642Boolean InputIsPartial) {643ConversionResult result = conversionOK;644const UTF8* source = *sourceStart;645UTF32* target = *targetStart;646while (source < sourceEnd) {647UTF32 ch = 0;648unsigned short extraBytesToRead = trailingBytesForUTF8[*source];649if (extraBytesToRead >= sourceEnd - source) {650if (flags == strictConversion || InputIsPartial) {651result = sourceExhausted;652break;653} else {654result = sourceIllegal;655656/*657* Replace the maximal subpart of ill-formed sequence with658* replacement character.659*/660source += findMaximalSubpartOfIllFormedUTF8Sequence(source,661sourceEnd);662*target++ = UNI_REPLACEMENT_CHAR;663continue;664}665}666if (target >= targetEnd) {667result = targetExhausted; break;668}669670/* Do this check whether lenient or strict */671if (!isLegalUTF8(source, extraBytesToRead+1)) {672result = sourceIllegal;673if (flags == strictConversion) {674/* Abort conversion. */675break;676} else {677/*678* Replace the maximal subpart of ill-formed sequence with679* replacement character.680*/681source += findMaximalSubpartOfIllFormedUTF8Sequence(source,682sourceEnd);683*target++ = UNI_REPLACEMENT_CHAR;684continue;685}686}687/*688* The cases all fall through. See "Note A" below.689*/690switch (extraBytesToRead) {691case 5: ch += *source++; ch <<= 6;692case 4: ch += *source++; ch <<= 6;693case 3: ch += *source++; ch <<= 6;694case 2: ch += *source++; ch <<= 6;695case 1: ch += *source++; ch <<= 6;696case 0: ch += *source++;697}698ch -= offsetsFromUTF8[extraBytesToRead];699700if (ch <= UNI_MAX_LEGAL_UTF32) {701/*702* UTF-16 surrogate values are illegal in UTF-32, and anything703* over Plane 17 (> 0x10FFFF) is illegal.704*/705if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {706if (flags == strictConversion) {707source -= (extraBytesToRead+1); /* return to the illegal value itself */708result = sourceIllegal;709break;710} else {711*target++ = UNI_REPLACEMENT_CHAR;712}713} else {714*target++ = ch;715}716} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */717result = sourceIllegal;718*target++ = UNI_REPLACEMENT_CHAR;719}720}721*sourceStart = source;722*targetStart = target;723return result;724}725726ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,727const UTF8 *sourceEnd,728UTF32 **targetStart,729UTF32 *targetEnd,730ConversionFlags flags) {731return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,732flags, /*InputIsPartial=*/true);733}734735ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,736const UTF8 *sourceEnd, UTF32 **targetStart,737UTF32 *targetEnd, ConversionFlags flags) {738return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,739flags, /*InputIsPartial=*/false);740}741742/* ---------------------------------------------------------------------743744Note A.745The fall-through switches in UTF-8 reading code save a746temp variable, some decrements & conditionals. The switches747are equivalent to the following loop:748{749int tmpBytesToRead = extraBytesToRead+1;750do {751ch += *source++;752--tmpBytesToRead;753if (tmpBytesToRead) ch <<= 6;754} while (tmpBytesToRead > 0);755}756In UTF-8 writing code, the switches on "bytesToWrite" are757similarly unrolled loops.758759--------------------------------------------------------------------- */760761} // namespace llvm762763ConvertUTF_RESTORE_WARNINGS764765766