Path: blob/main/contrib/llvm-project/llvm/lib/Support/ConvertUTFWrapper.cpp
//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <string>
#include <vector>

namespace llvm {

bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
  assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
  ConversionResult result = conversionOK;
  // Copy the character span over.
  if (WideCharWidth == 1) {
    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
      result = sourceIllegal;
      ErrorPtr = Pos;
    } else {
      memcpy(ResultPtr, Source.data(), Source.size());
      ResultPtr += Source.size();
    }
  } else if (WideCharWidth == 2) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF16 *targetStart = reinterpret_cast<UTF16 *>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result =
        ConvertUTF8toUTF16(&sourceStart, sourceStart + Source.size(),
                           &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char *>(targetStart);
    else
      ErrorPtr = sourceStart;
  } else if (WideCharWidth == 4) {
    const UTF8 *sourceStart = (const UTF8 *)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF32 *targetStart = reinterpret_cast<UTF32 *>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result =
        ConvertUTF8toUTF32(&sourceStart, sourceStart + Source.size(),
                           &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char *>(targetStart);
    else
      ErrorPtr = sourceStart;
  }
  assert((result != targetExhausted) &&
         "ConvertUTF8toUTFXX exhausted target buffer");
  return result == conversionOK;
}

bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
  const UTF32 *SourceStart = &Source;
  const UTF32 *SourceEnd = SourceStart + 1;
  UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
  UTF8 *TargetEnd = TargetStart + 4;
  ConversionResult CR = ConvertUTF32toUTF8(
      &SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion);
  if (CR != conversionOK)
    return false;

  ResultPtr = reinterpret_cast<char *>(TargetStart);
  return true;
}

bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
  return (S.size() >= 2 && ((S[0] == '\xff' && S[1] == '\xfe') ||
                            (S[0] == '\xfe' && S[1] == '\xff')));
}

bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // Error out on an uneven byte count.
  if (SrcBytes.size() % 2)
    return false;

  // Avoid OOB by returning early on empty input.
  if (SrcBytes.empty())
    return true;

  const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin());
  const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end());

  assert((uintptr_t)Src % sizeof(UTF16) == 0);

  // Byteswap if necessary.
  std::vector<UTF16> ByteSwapped;
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
    for (UTF16 &I : ByteSwapped)
      I = llvm::byteswap<uint16_t>(I);
    Src = &ByteSwapped[0];
    SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
    Src++;

  // Just allocate enough space up front. We'll shrink it later. Allocate
  // enough that we can fit a null terminator without reallocating.
  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}

bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
  return convertUTF16ToUTF8String(
      llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
                           Src.size() * sizeof(UTF16)),
      Out);
}

bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // Error out on an uneven byte count.
  if (SrcBytes.size() % 4)
    return false;

  // Avoid OOB by returning early on empty input.
  if (SrcBytes.empty())
    return true;

  const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin());
  const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end());

  assert((uintptr_t)Src % sizeof(UTF32) == 0);

  // Byteswap if necessary.
  std::vector<UTF32> ByteSwapped;
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
    for (UTF32 &I : ByteSwapped)
      I = llvm::byteswap<uint32_t>(I);
    Src = &ByteSwapped[0];
    SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
    Src++;

  // Just allocate enough space up front. We'll shrink it later. Allocate
  // enough that we can fit a null terminator without reallocating.
  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}

bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
  return convertUTF32ToUTF8String(
      llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
                           Src.size() * sizeof(UTF32)),
      Out);
}

bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                              SmallVectorImpl<UTF16> &DstUTF16) {
  assert(DstUTF16.empty());

  // Avoid OOB by returning early on empty input.
  if (SrcUTF8.empty()) {
    DstUTF16.push_back(0);
    DstUTF16.pop_back();
    return true;
  }

  const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
  const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end());

  // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
  // as UTF-16 should always require the same amount or less code units than the
  // UTF-8 encoding. Allocate one extra byte for the null terminator though,
  // so that someone calling DstUTF16.data() gets a null terminated string.
  // We resize down later so we don't have to worry that this over allocates.
  DstUTF16.resize(SrcUTF8.size()+1);
  UTF16 *Dst = &DstUTF16[0];
  UTF16 *DstEnd = Dst + DstUTF16.size();

  ConversionResult CR =
      ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    DstUTF16.clear();
    return false;
  }

  DstUTF16.resize(Dst - &DstUTF16[0]);
  DstUTF16.push_back(0);
  DstUTF16.pop_back();
  return true;
}

static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");

template <typename TResult>
static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
                                             TResult &Result) {
  // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
  // at least as large as the number of elements in the resulting wide
  // string, because surrogate pairs take at least 4 bytes in UTF-8.
  Result.resize(Source.size() + 1);
  char *ResultPtr = reinterpret_cast<char *>(&Result[0]);
  const UTF8 *ErrorPtr;
  if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
    Result.clear();
    return false;
  }
  Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);
  return true;
}

bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
  return ConvertUTF8toWideInternal(Source, Result);
}

bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
  if (!Source) {
    Result.clear();
    return true;
  }
  return ConvertUTF8toWide(llvm::StringRef(Source), Result);
}

bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
  if (sizeof(wchar_t) == 1) {
    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
    const UTF8 *End =
        reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
    if (!isLegalUTF8String(&Start, End))
      return false;
    Result.resize(Source.size());
    memcpy(&Result[0], Source.data(), Source.size());
    return true;
  } else if (sizeof(wchar_t) == 2) {
    return convertUTF16ToUTF8String(
        llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
                              Source.size()),
        Result);
  } else if (sizeof(wchar_t) == 4) {
    const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
    const UTF32 *End =
        reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
    Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
    UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
    UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
    if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
                           strictConversion) == conversionOK) {
      Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
      return true;
    } else {
      Result.clear();
      return false;
    }
  } else {
    llvm_unreachable(
        "Control should never reach this point; see static_assert further up");
  }
}

} // end namespace llvm
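
A minimal usage sketch (not part of the upstream file), assuming an LLVM development setup where llvm/Support/ConvertUTF.h is on the include path and the program links against LLVMSupport: it exercises the convertUTF16ToUTF8String and ConvertUTF8toWide overloads defined above; the sample data and assertions are illustrative only.

// Illustrative sketch; not part of ConvertUTFWrapper.cpp.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include <cassert>
#include <string>
#include <vector>

int main() {
  // "Hi" as native-endian UTF-16 code units.
  std::vector<llvm::UTF16> Src = {0x0048, 0x0069};
  std::string Out;
  // UTF-16 -> UTF-8; returns false on ill-formed input.
  if (llvm::convertUTF16ToUTF8String(Src, Out))
    assert(Out == "Hi");

  // Round-trip the UTF-8 result into a wide string.
  std::wstring Wide;
  if (llvm::ConvertUTF8toWide(llvm::StringRef(Out), Wide))
    assert(Wide.size() == 2);
  return 0;
}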