Path: blob/main/contrib/llvm-project/clang/lib/Lex/LiteralSupport.cpp
35233 views
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements the NumericLiteralParser, CharLiteralParser, and9// StringLiteralParser interfaces.10//11//===----------------------------------------------------------------------===//1213#include "clang/Lex/LiteralSupport.h"14#include "clang/Basic/CharInfo.h"15#include "clang/Basic/LangOptions.h"16#include "clang/Basic/SourceLocation.h"17#include "clang/Basic/TargetInfo.h"18#include "clang/Lex/LexDiagnostic.h"19#include "clang/Lex/Lexer.h"20#include "clang/Lex/Preprocessor.h"21#include "clang/Lex/Token.h"22#include "llvm/ADT/APInt.h"23#include "llvm/ADT/SmallVector.h"24#include "llvm/ADT/StringExtras.h"25#include "llvm/ADT/StringSwitch.h"26#include "llvm/Support/ConvertUTF.h"27#include "llvm/Support/Error.h"28#include "llvm/Support/ErrorHandling.h"29#include "llvm/Support/Unicode.h"30#include <algorithm>31#include <cassert>32#include <cstddef>33#include <cstdint>34#include <cstring>35#include <string>3637using namespace clang;3839static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {40switch (kind) {41default: llvm_unreachable("Unknown token type!");42case tok::char_constant:43case tok::string_literal:44case tok::utf8_char_constant:45case tok::utf8_string_literal:46return Target.getCharWidth();47case tok::wide_char_constant:48case tok::wide_string_literal:49return Target.getWCharWidth();50case tok::utf16_char_constant:51case tok::utf16_string_literal:52return Target.getChar16Width();53case tok::utf32_char_constant:54case tok::utf32_string_literal:55return Target.getChar32Width();56}57}5859static unsigned getEncodingPrefixLen(tok::TokenKind kind) {60switch (kind) 
{61default:62llvm_unreachable("Unknown token type!");63case tok::char_constant:64case tok::string_literal:65return 0;66case tok::utf8_char_constant:67case tok::utf8_string_literal:68return 2;69case tok::wide_char_constant:70case tok::wide_string_literal:71case tok::utf16_char_constant:72case tok::utf16_string_literal:73case tok::utf32_char_constant:74case tok::utf32_string_literal:75return 1;76}77}7879static CharSourceRange MakeCharSourceRange(const LangOptions &Features,80FullSourceLoc TokLoc,81const char *TokBegin,82const char *TokRangeBegin,83const char *TokRangeEnd) {84SourceLocation Begin =85Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,86TokLoc.getManager(), Features);87SourceLocation End =88Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,89TokLoc.getManager(), Features);90return CharSourceRange::getCharRange(Begin, End);91}9293/// Produce a diagnostic highlighting some portion of a literal.94///95/// Emits the diagnostic \p DiagID, highlighting the range of characters from96/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be97/// a substring of a spelling buffer for the token beginning at \p TokBegin.98static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,99const LangOptions &Features, FullSourceLoc TokLoc,100const char *TokBegin, const char *TokRangeBegin,101const char *TokRangeEnd, unsigned DiagID) {102SourceLocation Begin =103Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,104TokLoc.getManager(), Features);105return Diags->Report(Begin, DiagID) <<106MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);107}108109static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {110switch (Escape) {111case '\'':112case '"':113case '?':114case '\\':115case 'a':116case 'b':117case 'f':118case 'n':119case 'r':120case 't':121case 'v':122return true;123}124return false;125}126127/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in128/// 
either a character or a string literal.129static unsigned ProcessCharEscape(const char *ThisTokBegin,130const char *&ThisTokBuf,131const char *ThisTokEnd, bool &HadError,132FullSourceLoc Loc, unsigned CharWidth,133DiagnosticsEngine *Diags,134const LangOptions &Features,135StringLiteralEvalMethod EvalMethod) {136const char *EscapeBegin = ThisTokBuf;137bool Delimited = false;138bool EndDelimiterFound = false;139140// Skip the '\' char.141++ThisTokBuf;142143// We know that this character can't be off the end of the buffer, because144// that would have been \", which would not have been the end of string.145unsigned ResultChar = *ThisTokBuf++;146char Escape = ResultChar;147switch (ResultChar) {148// These map to themselves.149case '\\': case '\'': case '"': case '?': break;150151// These have fixed mappings.152case 'a':153// TODO: K&R: the meaning of '\\a' is different in traditional C154ResultChar = 7;155break;156case 'b':157ResultChar = 8;158break;159case 'e':160if (Diags)161Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,162diag::ext_nonstandard_escape) << "e";163ResultChar = 27;164break;165case 'E':166if (Diags)167Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,168diag::ext_nonstandard_escape) << "E";169ResultChar = 27;170break;171case 'f':172ResultChar = 12;173break;174case 'n':175ResultChar = 10;176break;177case 'r':178ResultChar = 13;179break;180case 't':181ResultChar = 9;182break;183case 'v':184ResultChar = 11;185break;186case 'x': { // Hex escape.187ResultChar = 0;188if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {189Delimited = true;190ThisTokBuf++;191if (*ThisTokBuf == '}') {192Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,193diag::err_delimited_escape_empty);194return ResultChar;195}196} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {197if (Diags)198Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,199diag::err_hex_escape_no_digits) << "x";200return 
ResultChar;201}202203// Hex escapes are a maximal series of hex digits.204bool Overflow = false;205for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {206if (Delimited && *ThisTokBuf == '}') {207ThisTokBuf++;208EndDelimiterFound = true;209break;210}211int CharVal = llvm::hexDigitValue(*ThisTokBuf);212if (CharVal == -1) {213// Non delimited hex escape sequences stop at the first non-hex digit.214if (!Delimited)215break;216HadError = true;217if (Diags)218Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,219diag::err_delimited_escape_invalid)220<< StringRef(ThisTokBuf, 1);221continue;222}223// About to shift out a digit?224if (ResultChar & 0xF0000000)225Overflow = true;226ResultChar <<= 4;227ResultChar |= CharVal;228}229// See if any bits will be truncated when evaluated as a character.230if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {231Overflow = true;232ResultChar &= ~0U >> (32-CharWidth);233}234235// Check for overflow.236if (!HadError && Overflow) { // Too many digits to fit in237HadError = true;238if (Diags)239Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,240diag::err_escape_too_large)241<< 0;242}243break;244}245case '0': case '1': case '2': case '3':246case '4': case '5': case '6': case '7': {247// Octal escapes.248--ThisTokBuf;249ResultChar = 0;250251// Octal escapes are a series of octal digits with maximum length 3.252// "\0123" is a two digit sequence equal to "\012" "3".253unsigned NumDigits = 0;254do {255ResultChar <<= 3;256ResultChar |= *ThisTokBuf++ - '0';257++NumDigits;258} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&259ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');260261// Check for overflow. 
Reject '\777', but not L'\777'.262if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {263if (Diags)264Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,265diag::err_escape_too_large) << 1;266ResultChar &= ~0U >> (32-CharWidth);267}268break;269}270case 'o': {271bool Overflow = false;272if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {273HadError = true;274if (Diags)275Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,276diag::err_delimited_escape_missing_brace)277<< "o";278279break;280}281ResultChar = 0;282Delimited = true;283++ThisTokBuf;284if (*ThisTokBuf == '}') {285Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,286diag::err_delimited_escape_empty);287return ResultChar;288}289290while (ThisTokBuf != ThisTokEnd) {291if (*ThisTokBuf == '}') {292EndDelimiterFound = true;293ThisTokBuf++;294break;295}296if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {297HadError = true;298if (Diags)299Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,300diag::err_delimited_escape_invalid)301<< StringRef(ThisTokBuf, 1);302ThisTokBuf++;303continue;304}305// Check if one of the top three bits is set before shifting them out.306if (ResultChar & 0xE0000000)307Overflow = true;308309ResultChar <<= 3;310ResultChar |= *ThisTokBuf++ - '0';311}312// Check for overflow. Reject '\777', but not L'\777'.313if (!HadError &&314(Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {315HadError = true;316if (Diags)317Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,318diag::err_escape_too_large)319<< 1;320ResultChar &= ~0U >> (32 - CharWidth);321}322break;323}324// Otherwise, these are not valid escapes.325case '(': case '{': case '[': case '%':326// GCC accepts these as extensions. 
We warn about them as such though.327if (Diags)328Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,329diag::ext_nonstandard_escape)330<< std::string(1, ResultChar);331break;332default:333if (!Diags)334break;335336if (isPrintable(ResultChar))337Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,338diag::ext_unknown_escape)339<< std::string(1, ResultChar);340else341Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,342diag::ext_unknown_escape)343<< "x" + llvm::utohexstr(ResultChar);344break;345}346347if (Delimited && Diags) {348if (!EndDelimiterFound)349Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,350diag::err_expected)351<< tok::r_brace;352else if (!HadError) {353Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,354Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence355: diag::ext_delimited_escape_sequence)356<< /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);357}358}359360if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&361!IsEscapeValidInUnevaluatedStringLiteral(Escape)) {362Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,363diag::err_unevaluated_string_invalid_escape_sequence)364<< StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);365HadError = true;366}367368return ResultChar;369}370371static void appendCodePoint(unsigned Codepoint,372llvm::SmallVectorImpl<char> &Str) {373char ResultBuf[4];374char *ResultPtr = ResultBuf;375if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))376Str.append(ResultBuf, ResultPtr);377}378379void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {380for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {381if (*I != '\\') {382Buf.push_back(*I);383continue;384}385386++I;387char Kind = *I;388++I;389390assert(Kind == 'u' || Kind == 'U' || Kind == 'N');391uint32_t CodePoint = 0;392393if (Kind == 'u' && *I == '{') {394for (++I; *I != '}'; ++I) {395unsigned Value = 
llvm::hexDigitValue(*I);396assert(Value != -1U);397CodePoint <<= 4;398CodePoint += Value;399}400appendCodePoint(CodePoint, Buf);401continue;402}403404if (Kind == 'N') {405assert(*I == '{');406++I;407auto Delim = std::find(I, Input.end(), '}');408assert(Delim != Input.end());409StringRef Name(I, std::distance(I, Delim));410std::optional<llvm::sys::unicode::LooseMatchingResult> Res =411llvm::sys::unicode::nameToCodepointLooseMatching(Name);412assert(Res && "could not find a codepoint that was previously found");413CodePoint = Res->CodePoint;414assert(CodePoint != 0xFFFFFFFF);415appendCodePoint(CodePoint, Buf);416I = Delim;417continue;418}419420unsigned NumHexDigits;421if (Kind == 'u')422NumHexDigits = 4;423else424NumHexDigits = 8;425426assert(I + NumHexDigits <= E);427428for (; NumHexDigits != 0; ++I, --NumHexDigits) {429unsigned Value = llvm::hexDigitValue(*I);430assert(Value != -1U);431432CodePoint <<= 4;433CodePoint += Value;434}435436appendCodePoint(CodePoint, Buf);437--I;438}439}440441bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,442const LangOptions &LO) {443return LO.MicrosoftExt &&444(K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||445K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||446K == tok::kw___FUNCDNAME__);447}448449bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {450return tok::isStringLiteral(Tok.getKind()) ||451isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);452}453454static bool ProcessNumericUCNEscape(const char *ThisTokBegin,455const char *&ThisTokBuf,456const char *ThisTokEnd, uint32_t &UcnVal,457unsigned short &UcnLen, bool &Delimited,458FullSourceLoc Loc, DiagnosticsEngine *Diags,459const LangOptions &Features,460bool in_char_string_literal = false) {461const char *UcnBegin = ThisTokBuf;462bool HasError = false;463bool EndDelimiterFound = false;464465// Skip the '\u' char's.466ThisTokBuf += 2;467Delimited = false;468if (UcnBegin[1] == 'u' && in_char_string_literal 
&&469ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {470Delimited = true;471ThisTokBuf++;472} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {473if (Diags)474Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,475diag::err_hex_escape_no_digits)476<< StringRef(&ThisTokBuf[-1], 1);477return false;478}479UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);480481bool Overflow = false;482unsigned short Count = 0;483for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);484++ThisTokBuf) {485if (Delimited && *ThisTokBuf == '}') {486++ThisTokBuf;487EndDelimiterFound = true;488break;489}490int CharVal = llvm::hexDigitValue(*ThisTokBuf);491if (CharVal == -1) {492HasError = true;493if (!Delimited)494break;495if (Diags) {496Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,497diag::err_delimited_escape_invalid)498<< StringRef(ThisTokBuf, 1);499}500Count++;501continue;502}503if (UcnVal & 0xF0000000) {504Overflow = true;505continue;506}507UcnVal <<= 4;508UcnVal |= CharVal;509Count++;510}511512if (Overflow) {513if (Diags)514Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,515diag::err_escape_too_large)516<< 0;517return false;518}519520if (Delimited && !EndDelimiterFound) {521if (Diags) {522Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,523diag::err_expected)524<< tok::r_brace;525}526return false;527}528529// If we didn't consume the proper number of digits, there is a problem.530if (Count == 0 || (!Delimited && Count != UcnLen)) {531if (Diags)532Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,533Delimited ? 
diag::err_delimited_escape_empty534: diag::err_ucn_escape_incomplete);535return false;536}537return !HasError;538}539540static void DiagnoseInvalidUnicodeCharacterName(541DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,542const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,543llvm::StringRef Name) {544545Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,546diag::err_invalid_ucn_name)547<< Name;548549namespace u = llvm::sys::unicode;550551std::optional<u::LooseMatchingResult> Res =552u::nameToCodepointLooseMatching(Name);553if (Res) {554Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,555diag::note_invalid_ucn_name_loose_matching)556<< FixItHint::CreateReplacement(557MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,558TokRangeEnd),559Res->Name);560return;561}562563unsigned Distance = 0;564SmallVector<u::MatchForCodepointName> Matches =565u::nearestMatchesForCodepointName(Name, 5);566assert(!Matches.empty() && "No unicode characters found");567568for (const auto &Match : Matches) {569if (Distance == 0)570Distance = Match.Distance;571if (std::max(Distance, Match.Distance) -572std::min(Distance, Match.Distance) >5733)574break;575Distance = Match.Distance;576577std::string Str;578llvm::UTF32 V = Match.Value;579bool Converted =580llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);581(void)Converted;582assert(Converted && "Found a match wich is not a unicode character");583584Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,585diag::note_invalid_ucn_name_candidate)586<< Match.Name << llvm::utohexstr(Match.Value)587<< Str // FIXME: Fix the rendering of non printable characters588<< FixItHint::CreateReplacement(589MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,590TokRangeEnd),591Match.Name);592}593}594595static bool ProcessNamedUCNEscape(const char *ThisTokBegin,596const char *&ThisTokBuf,597const char *ThisTokEnd, uint32_t &UcnVal,598unsigned 
short &UcnLen, FullSourceLoc Loc,599DiagnosticsEngine *Diags,600const LangOptions &Features) {601const char *UcnBegin = ThisTokBuf;602assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');603ThisTokBuf += 2;604if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {605if (Diags) {606Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,607diag::err_delimited_escape_missing_brace)608<< StringRef(&ThisTokBuf[-1], 1);609}610return false;611}612ThisTokBuf++;613const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {614return C == '}' || isVerticalWhitespace(C);615});616bool Incomplete = ClosingBrace == ThisTokEnd;617bool Empty = ClosingBrace == ThisTokBuf;618if (Incomplete || Empty) {619if (Diags) {620Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,621Incomplete ? diag::err_ucn_escape_incomplete622: diag::err_delimited_escape_empty)623<< StringRef(&UcnBegin[1], 1);624}625ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;626return false;627}628StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);629ThisTokBuf = ClosingBrace + 1;630std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);631if (!Res) {632if (Diags)633DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,634&UcnBegin[3], ClosingBrace, Name);635return false;636}637UcnVal = *Res;638UcnLen = UcnVal > 0xFFFF ? 
8 : 4;639return true;640}641642/// ProcessUCNEscape - Read the Universal Character Name, check constraints and643/// return the UTF32.644static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,645const char *ThisTokEnd, uint32_t &UcnVal,646unsigned short &UcnLen, FullSourceLoc Loc,647DiagnosticsEngine *Diags,648const LangOptions &Features,649bool in_char_string_literal = false) {650651bool HasError;652const char *UcnBegin = ThisTokBuf;653bool IsDelimitedEscapeSequence = false;654bool IsNamedEscapeSequence = false;655if (ThisTokBuf[1] == 'N') {656IsNamedEscapeSequence = true;657HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,658UcnVal, UcnLen, Loc, Diags, Features);659} else {660HasError =661!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,662UcnLen, IsDelimitedEscapeSequence, Loc, Diags,663Features, in_char_string_literal);664}665if (HasError)666return false;667668// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]669if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints670UcnVal > 0x10FFFF) { // maximum legal UTF32 value671if (Diags)672Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,673diag::err_ucn_escape_invalid);674return false;675}676677// C23 and C++11 allow UCNs that refer to control characters678// and basic source characters inside character and string literals679if (UcnVal < 0xa0 &&680// $, @, ` are allowed in all language modes681(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {682bool IsError =683(!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);684if (Diags) {685char BasicSCSChar = UcnVal;686if (UcnVal >= 0x20 && UcnVal < 0x7f)687Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,688IsError ? diag::err_ucn_escape_basic_scs689: Features.CPlusPlus690? 
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs691: diag::warn_c23_compat_literal_ucn_escape_basic_scs)692<< StringRef(&BasicSCSChar, 1);693else694Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,695IsError ? diag::err_ucn_control_character696: Features.CPlusPlus697? diag::warn_cxx98_compat_literal_ucn_control_character698: diag::warn_c23_compat_literal_ucn_control_character);699}700if (IsError)701return false;702}703704if (!Features.CPlusPlus && !Features.C99 && Diags)705Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,706diag::warn_ucn_not_valid_in_c89_literal);707708if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)709Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,710Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence711: diag::ext_delimited_escape_sequence)712<< (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);713714return true;715}716717/// MeasureUCNEscape - Determine the number of bytes within the resulting string718/// which this UCN will occupy.719static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,720const char *ThisTokEnd, unsigned CharByteWidth,721const LangOptions &Features, bool &HadError) {722// UTF-32: 4 bytes per escape.723if (CharByteWidth == 4)724return 4;725726uint32_t UcnVal = 0;727unsigned short UcnLen = 0;728FullSourceLoc Loc;729730if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,731UcnLen, Loc, nullptr, Features, true)) {732HadError = true;733return 0;734}735736// UTF-16: 2 bytes for BMP, 4 bytes otherwise.737if (CharByteWidth == 2)738return UcnVal <= 0xFFFF ? 2 : 4;739740// UTF-8.741if (UcnVal < 0x80)742return 1;743if (UcnVal < 0x800)744return 2;745if (UcnVal < 0x10000)746return 3;747return 4;748}749750/// EncodeUCNEscape - Read the Universal Character Name, check constraints and751/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of752/// StringLiteralParser. 
When we decide to implement UCN's for identifiers,753/// we will likely rework our support for UCN's.754static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,755const char *ThisTokEnd,756char *&ResultBuf, bool &HadError,757FullSourceLoc Loc, unsigned CharByteWidth,758DiagnosticsEngine *Diags,759const LangOptions &Features) {760typedef uint32_t UTF32;761UTF32 UcnVal = 0;762unsigned short UcnLen = 0;763if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,764Loc, Diags, Features, true)) {765HadError = true;766return;767}768769assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&770"only character widths of 1, 2, or 4 bytes supported");771772(void)UcnLen;773assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");774775if (CharByteWidth == 4) {776// FIXME: Make the type of the result buffer correct instead of777// using reinterpret_cast.778llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);779*ResultPtr = UcnVal;780ResultBuf += 4;781return;782}783784if (CharByteWidth == 2) {785// FIXME: Make the type of the result buffer correct instead of786// using reinterpret_cast.787llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);788789if (UcnVal <= (UTF32)0xFFFF) {790*ResultPtr = UcnVal;791ResultBuf += 2;792return;793}794795// Convert to UTF16.796UcnVal -= 0x10000;797*ResultPtr = 0xD800 + (UcnVal >> 10);798*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);799ResultBuf += 4;800return;801}802803assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");804805// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.806// The conversion below was inspired by:807// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c808// First, we determine how many bytes the result will require.809typedef uint8_t UTF8;810811unsigned short bytesToWrite = 0;812if (UcnVal < (UTF32)0x80)813bytesToWrite = 1;814else if (UcnVal < (UTF32)0x800)815bytesToWrite 
= 2;816else if (UcnVal < (UTF32)0x10000)817bytesToWrite = 3;818else819bytesToWrite = 4;820821const unsigned byteMask = 0xBF;822const unsigned byteMark = 0x80;823824// Once the bits are split out into bytes of UTF8, this is a mask OR-ed825// into the first byte, depending on how many bytes follow.826static const UTF8 firstByteMark[5] = {8270x00, 0x00, 0xC0, 0xE0, 0xF0828};829// Finally, we write the bytes into ResultBuf.830ResultBuf += bytesToWrite;831switch (bytesToWrite) { // note: everything falls through.832case 4:833*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;834[[fallthrough]];835case 3:836*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;837[[fallthrough]];838case 2:839*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;840[[fallthrough]];841case 1:842*--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);843}844// Update the buffer.845ResultBuf += bytesToWrite;846}847848/// integer-constant: [C99 6.4.4.1]849/// decimal-constant integer-suffix850/// octal-constant integer-suffix851/// hexadecimal-constant integer-suffix852/// binary-literal integer-suffix [GNU, C++1y]853/// user-defined-integer-literal: [C++11 lex.ext]854/// decimal-literal ud-suffix855/// octal-literal ud-suffix856/// hexadecimal-literal ud-suffix857/// binary-literal ud-suffix [GNU, C++1y]858/// decimal-constant:859/// nonzero-digit860/// decimal-constant digit861/// octal-constant:862/// 0863/// octal-constant octal-digit864/// hexadecimal-constant:865/// hexadecimal-prefix hexadecimal-digit866/// hexadecimal-constant hexadecimal-digit867/// hexadecimal-prefix: one of868/// 0x 0X869/// binary-literal:870/// 0b binary-digit871/// 0B binary-digit872/// binary-literal binary-digit873/// integer-suffix:874/// unsigned-suffix [long-suffix]875/// unsigned-suffix [long-long-suffix]876/// long-suffix [unsigned-suffix]877/// long-long-suffix [unsigned-sufix]878/// nonzero-digit:879/// 1 2 3 4 5 6 7 8 9880/// octal-digit:881/// 0 1 2 3 
4 5 6 7882/// hexadecimal-digit:883/// 0 1 2 3 4 5 6 7 8 9884/// a b c d e f885/// A B C D E F886/// binary-digit:887/// 0888/// 1889/// unsigned-suffix: one of890/// u U891/// long-suffix: one of892/// l L893/// long-long-suffix: one of894/// ll LL895///896/// floating-constant: [C99 6.4.4.2]897/// TODO: add rules...898///899NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,900SourceLocation TokLoc,901const SourceManager &SM,902const LangOptions &LangOpts,903const TargetInfo &Target,904DiagnosticsEngine &Diags)905: SM(SM), LangOpts(LangOpts), Diags(Diags),906ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {907908s = DigitsBegin = ThisTokBegin;909saw_exponent = false;910saw_period = false;911saw_ud_suffix = false;912saw_fixed_point_suffix = false;913isLong = false;914isUnsigned = false;915isLongLong = false;916isSizeT = false;917isHalf = false;918isFloat = false;919isImaginary = false;920isFloat16 = false;921isFloat128 = false;922MicrosoftInteger = 0;923isFract = false;924isAccum = false;925hadError = false;926isBitInt = false;927928// This routine assumes that the range begin/end matches the regex for integer929// and FP constants (specifically, the 'pp-number' regex), and assumes that930// the byte at "*end" is both valid and not part of the regex. Because of931// this, it doesn't have to check for 'overscan' in various places.932// Note: For HLSL, the end token is allowed to be '.' which would be in the933// 'pp-number' regex. This is required to support vector swizzles on numeric934// constants (i.e. 
1.xx or 1.5f.rrr).935if (isPreprocessingNumberBody(*ThisTokEnd) &&936!(LangOpts.HLSL && *ThisTokEnd == '.')) {937Diags.Report(TokLoc, diag::err_lexing_numeric);938hadError = true;939return;940}941942if (*s == '0') { // parse radix943ParseNumberStartingWithZero(TokLoc);944if (hadError)945return;946} else { // the first digit is non-zero947radix = 10;948s = SkipDigits(s);949if (s == ThisTokEnd) {950// Done.951} else {952ParseDecimalOrOctalCommon(TokLoc);953if (hadError)954return;955}956}957958SuffixBegin = s;959checkSeparator(TokLoc, s, CSK_AfterDigits);960961// Initial scan to lookahead for fixed point suffix.962if (LangOpts.FixedPoint) {963for (const char *c = s; c != ThisTokEnd; ++c) {964if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {965saw_fixed_point_suffix = true;966break;967}968}969}970971// Parse the suffix. At this point we can classify whether we have an FP or972// integer constant.973bool isFixedPointConstant = isFixedPointLiteral();974bool isFPConstant = isFloatingLiteral();975bool HasSize = false;976bool DoubleUnderscore = false;977978// Loop over all of the characters of the suffix. 
If we see something bad,979// we break out of the loop.980for (; s != ThisTokEnd; ++s) {981switch (*s) {982case 'R':983case 'r':984if (!LangOpts.FixedPoint)985break;986if (isFract || isAccum) break;987if (!(saw_period || saw_exponent)) break;988isFract = true;989continue;990case 'K':991case 'k':992if (!LangOpts.FixedPoint)993break;994if (isFract || isAccum) break;995if (!(saw_period || saw_exponent)) break;996isAccum = true;997continue;998case 'h': // FP Suffix for "half".999case 'H':1000// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.1001if (!(LangOpts.Half || LangOpts.FixedPoint))1002break;1003if (isIntegerLiteral()) break; // Error for integer constant.1004if (HasSize)1005break;1006HasSize = true;1007isHalf = true;1008continue; // Success.1009case 'f': // FP Suffix for "float"1010case 'F':1011if (!isFPConstant) break; // Error for integer constant.1012if (HasSize)1013break;1014HasSize = true;10151016// CUDA host and device may have different _Float16 support, therefore1017// allows f16 literals to avoid false alarm.1018// When we compile for OpenMP target offloading on NVPTX, f16 suffix1019// should also be supported.1020// ToDo: more precise check for CUDA.1021// TODO: AMDGPU might also support it in the future.1022if ((Target.hasFloat16Type() || LangOpts.CUDA ||1023(LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&1024s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {1025s += 2; // success, eat up 2 characters.1026isFloat16 = true;1027continue;1028}10291030isFloat = true;1031continue; // Success.1032case 'q': // FP Suffix for "__float128"1033case 'Q':1034if (!isFPConstant) break; // Error for integer constant.1035if (HasSize)1036break;1037HasSize = true;1038isFloat128 = true;1039continue; // Success.1040case 'u':1041case 'U':1042if (isFPConstant) break; // Error for floating constant.1043if (isUnsigned) break; // Cannot be repeated.1044isUnsigned = true;1045continue; // Success.1046case 'l':1047case 'L':1048if 
(HasSize)1049break;1050HasSize = true;10511052// Check for long long. The L's need to be adjacent and the same case.1053if (s[1] == s[0]) {1054assert(s + 1 < ThisTokEnd && "didn't maximally munch?");1055if (isFPConstant) break; // long long invalid for floats.1056isLongLong = true;1057++s; // Eat both of them.1058} else {1059isLong = true;1060}1061continue; // Success.1062case 'z':1063case 'Z':1064if (isFPConstant)1065break; // Invalid for floats.1066if (HasSize)1067break;1068HasSize = true;1069isSizeT = true;1070continue;1071case 'i':1072case 'I':1073if (LangOpts.MicrosoftExt && !isFPConstant) {1074// Allow i8, i16, i32, and i64. First, look ahead and check if1075// suffixes are Microsoft integers and not the imaginary unit.1076uint8_t Bits = 0;1077size_t ToSkip = 0;1078switch (s[1]) {1079case '8': // i8 suffix1080Bits = 8;1081ToSkip = 2;1082break;1083case '1':1084if (s[2] == '6') { // i16 suffix1085Bits = 16;1086ToSkip = 3;1087}1088break;1089case '3':1090if (s[2] == '2') { // i32 suffix1091Bits = 32;1092ToSkip = 3;1093}1094break;1095case '6':1096if (s[2] == '4') { // i64 suffix1097Bits = 64;1098ToSkip = 3;1099}1100break;1101default:1102break;1103}1104if (Bits) {1105if (HasSize)1106break;1107HasSize = true;1108MicrosoftInteger = Bits;1109s += ToSkip;1110assert(s <= ThisTokEnd && "didn't maximally munch?");1111break;1112}1113}1114[[fallthrough]];1115case 'j':1116case 'J':1117if (isImaginary) break; // Cannot be repeated.1118isImaginary = true;1119continue; // Success.1120case '_':1121if (isFPConstant)1122break; // Invalid for floats1123if (HasSize)1124break;1125// There is currently no way to reach this with DoubleUnderscore set.1126// If new double underscope literals are added handle it here as above.1127assert(!DoubleUnderscore && "unhandled double underscore case");1128if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd &&1129s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists1130// after __1131DoubleUnderscore = true;1132s += 2; // Skip both 
'_'1133if (s + 1 < ThisTokEnd &&1134(*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U'1135isUnsigned = true;1136++s;1137}1138if (s + 1 < ThisTokEnd &&1139((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) {1140isBitInt = true;1141HasSize = true;1142continue;1143}1144}1145break;1146case 'w':1147case 'W':1148if (isFPConstant)1149break; // Invalid for floats.1150if (HasSize)1151break; // Invalid if we already have a size for the literal.11521153// wb and WB are allowed, but a mixture of cases like Wb or wB is not. We1154// explicitly do not support the suffix in C++ as an extension because a1155// library-based UDL that resolves to a library type may be more1156// appropriate there. The same rules apply for __wb/__WB.1157if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd &&1158((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) {1159isBitInt = true;1160HasSize = true;1161++s; // Skip both characters (2nd char skipped on continue).1162continue; // Success.1163}1164}1165// If we reached here, there was an error or a ud-suffix.1166break;1167}11681169// "i", "if", and "il" are user-defined suffixes in C++1y.1170if (s != ThisTokEnd || isImaginary) {1171// FIXME: Don't bother expanding UCNs if !tok.hasUCN().1172expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));1173if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {1174if (!isImaginary) {1175// Any suffix pieces we might have parsed are actually part of the1176// ud-suffix.1177isLong = false;1178isUnsigned = false;1179isLongLong = false;1180isSizeT = false;1181isFloat = false;1182isFloat16 = false;1183isHalf = false;1184isImaginary = false;1185isBitInt = false;1186MicrosoftInteger = 0;1187saw_fixed_point_suffix = false;1188isFract = false;1189isAccum = false;1190}11911192saw_ud_suffix = true;1193return;1194}11951196if (s != ThisTokEnd) {1197// Report an error if there are any.1198Diags.Report(Lexer::AdvanceToTokenCharacter(1199TokLoc, SuffixBegin - 
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
/// numbers. It issues an error for illegal digits, and handles floating point
/// parsing. If it detects a floating point number, the radix is set to 10.
///
/// On entry `s` points at the first character after the leading run of
/// digits. On return `s` has been advanced past any fraction and exponent that
/// were recognised; `saw_period` / `saw_exponent` record what was seen and
/// `hadError` is set if a diagnostic was issued.
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
  assert((radix == 8 || radix == 10) && "Unexpected radix");

  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
  // the code is using an incorrect base. A remaining spelling that forms a
  // valid ud-suffix is not treated as a stray digit.
  if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
      !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
    Diags.Report(
        Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
        diag::err_invalid_digit)
        << StringRef(s, 1) << (radix == 8 ? 1 : 0);
    hadError = true;
    return;
  }

  if (*s == '.') {
    // A digit separator may not sit directly before or after the '.'.
    checkSeparator(TokLoc, s, CSK_AfterDigits);
    s++;
    radix = 10;
    saw_period = true;
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
    s = SkipDigits(s); // Skip the digits that follow the '.'.
  }
  if (*s == 'e' || *s == 'E') { // exponent
    checkSeparator(TokLoc, s, CSK_AfterDigits);
    const char *Exponent = s;
    s++;
    radix = 10;
    saw_exponent = true;
    if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
    const char *first_non_digit = SkipDigits(s);
    if (containsDigits(s, first_non_digit)) {
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
      s = first_non_digit;
    } else {
      // Report the missing exponent digits only if nothing else has already
      // been diagnosed for this literal.
      if (!hadError) {
        Diags.Report(Lexer::AdvanceToTokenCharacter(
                         TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
                     diag::err_exponent_has_no_digits);
        hadError = true;
      }
      return;
    }
  }
}

/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
/// treat it as an invalid suffix.
///
/// Always returns false before C++11 and for an empty suffix.
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
                                           StringRef Suffix) {
  if (!LangOpts.CPlusPlus11 || Suffix.empty())
    return false;

  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
  // Suffixes starting with '__' (double underscore) are for use by
  // the implementation.
  if (Suffix.starts_with("_") && !Suffix.starts_with("__"))
    return true;

  // In C++11, there are no library suffixes.
  if (!LangOpts.CPlusPlus14)
    return false;

  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
  // In C++2a "d" and "y" are used in the library.
  return llvm::StringSwitch<bool>(Suffix)
      .Cases("h", "min", "s", true)
      .Cases("ms", "us", "ns", true)
      .Cases("il", "i", "if", true)
      .Cases("d", "y", LangOpts.CPlusPlus20)
      .Default(false);
}

/// Diagnose a digit separator that is adjacent to \p Pos.
///
/// With CSK_AfterDigits the character immediately before \p Pos is examined
/// (nothing is checked when \p Pos is the start of the token); otherwise the
/// character at \p Pos is examined (nothing is checked when \p Pos is the end
/// of the token). If that character is a digit separator,
/// err_digit_separator_not_between_digits is reported and hadError is set.
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
                                          const char *Pos,
                                          CheckSeparatorKind IsAfterDigits) {
  if (IsAfterDigits == CSK_AfterDigits) {
    if (Pos == ThisTokBegin)
      return;
    --Pos;
  } else if (Pos == ThisTokEnd)
    return;

  if (isDigitSeparator(*Pos)) {
    Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
                                                LangOpts),
                 diag::err_digit_separator_not_between_digits)
      << IsAfterDigits;
    hadError = true;
  }
}

/// ParseNumberStartingWithZero - This method is called when the first character
/// of the number is found to be a zero. This means it is either an octal
/// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010') or
/// a floating point number (01239.123e4). Eat the prefix, determining the
/// radix etc.
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  assert(s[0] == '0' && "Invalid method call");
  s++;

  int c1 = s[0];

  // Handle a hex number like 0x1234.
  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
    s++;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 16;
    DigitsBegin = s;
    s = SkipHexDigits(s);
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (*s == '.') {
      s++;
      saw_period = true;
      const char *floatDigitsBegin = s;
      s = SkipHexDigits(s);
      // Digits on either side of the '.' are enough for a significand.
      if (containsDigits(floatDigitsBegin, s))
        HasSignificandDigits = true;
      if (HasSignificandDigits)
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
    }

    if (!HasSignificandDigits) {
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 1;
      hadError = true;
      return;
    }

    // A binary exponent can appear with or without a '.'. If dotted, the
    // binary exponent is required.
    if (*s == 'p' || *s == 'P') {
      checkSeparator(TokLoc, s, CSK_AfterDigits);
      const char *Exponent = s;
      s++;
      saw_exponent = true;
      if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
      const char *first_non_digit = SkipDigits(s);
      if (!containsDigits(s, first_non_digit)) {
        if (!hadError) {
          Diags.Report(Lexer::AdvanceToTokenCharacter(
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
                       diag::err_exponent_has_no_digits);
          hadError = true;
        }
        return;
      }
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
      s = first_non_digit;

      // Hex floats are diagnosed as an extension when the language mode does
      // not enable them, and as a compatibility warning in C++17 and later.
      if (!LangOpts.HexFloats)
        Diags.Report(TokLoc, LangOpts.CPlusPlus
                                 ? diag::ext_hex_literal_invalid
                                 : diag::ext_hex_constant_invalid);
      else if (LangOpts.CPlusPlus17)
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
    } else if (saw_period) {
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 0;
      hadError = true;
    }
    return;
  }

  // Handle simple binary numbers 0b01010
  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
    // 0b101010 is a C++14 and C23 extension.
    unsigned DiagId;
    if (LangOpts.CPlusPlus14)
      DiagId = diag::warn_cxx11_compat_binary_literal;
    else if (LangOpts.C23)
      DiagId = diag::warn_c23_compat_binary_literal;
    else if (LangOpts.CPlusPlus)
      DiagId = diag::ext_binary_literal_cxx14;
    else
      DiagId = diag::ext_binary_literal;
    Diags.Report(TokLoc, DiagId);
    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 2;
    DigitsBegin = s;
    s = SkipBinaryDigits(s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (isHexDigit(*s) &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_invalid_digit)
          << StringRef(s, 1) << 2;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // For now, the radix is set to 8. If we discover that we have a
  // floating point constant, the radix will change to 10. Octal floating
  // point constants are not permitted (only decimal and hexadecimal).
  radix = 8;
  const char *PossibleNewDigitStart = s;
  s = SkipOctalDigits(s);
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
  // as the start of the digits. So if skipping octal digits does not skip
  // anything, we leave the digit start where it was.
  if (s != PossibleNewDigitStart)
    DigitsBegin = PossibleNewDigitStart;

  if (s == ThisTokEnd)
    return; // Done, simple octal number like 01234

  // If we have some other non-octal digit that *is* a decimal digit, see if
  // this is part of a floating point number like 094.123 or 09e1.
  if (isDigit(*s)) {
    const char *EndDecimal = SkipDigits(s);
    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
      s = EndDecimal;
      radix = 10;
    }
  }

  ParseDecimalOrOctalCommon(TokLoc);
}

/// Return true if a literal made of \p NumDigits digits in base \p Radix is
/// guaranteed to be representable in 64 bits. This is a conservative bound
/// based only on the digit count; only radices 2, 8, 10 and 16 are expected.
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
  switch (Radix) {
  case 2:
    return NumDigits <= 64;
  case 8:
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
  case 10:
    return NumDigits <= 19; // floor(log10(2^64))
  case 16:
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
  default:
    llvm_unreachable("impossible Radix");
  }
}

/// GetIntegerValue - Convert this numeric literal value to an APInt that
/// matches Val's input width. If there is an overflow, set Val to the low bits
/// of the result and return true. Otherwise, return false.
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
  // Fast path: Compute a conservative bound on the maximum number of
  // bits per digit in this radix. If we can't possibly overflow a
  // uint64 based on that bound then do the simple conversion to
  // integer. This avoids the expensive overflow checking below, and
  // handles the common cases that matter (small decimal integers and
  // hex/octal values which don't overflow).
  // Note that NumDigits also counts any digit separators in the range, which
  // only makes the bound more conservative.
  const unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (alwaysFitsInto64Bits(radix, NumDigits)) {
    uint64_t N = 0;
    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
      if (!isDigitSeparator(*Ptr))
        N = N * radix + llvm::hexDigitValue(*Ptr);

    // This will truncate the value to Val's input width. Simply check
    // for overflow by comparing.
    Val = N;
    return Val.getZExtValue() != N;
  }

  // Slow path: accumulate in Val's own width, checking every step.
  Val = 0;
  const char *Ptr = DigitsBegin;

  llvm::APInt RadixVal(Val.getBitWidth(), radix);
  llvm::APInt CharVal(Val.getBitWidth(), 0);
  llvm::APInt OldVal = Val;

  bool OverflowOccurred = false;
  while (Ptr < SuffixBegin) {
    if (isDigitSeparator(*Ptr)) {
      ++Ptr;
      continue;
    }

    unsigned C = llvm::hexDigitValue(*Ptr++);

    // If this letter is out of bound for this radix, reject it.
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

    CharVal = C;

    // Add the digit to the value in the appropriate radix. If adding in digits
    // made the value smaller, then this overflowed.
    OldVal = Val;

    // Multiply by radix, did overflow occur on the multiply?
    Val *= RadixVal;
    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;

    // Add value, did overflow occur on the value?
    //   (a + b) ult b  <=> overflow
    Val += CharVal;
    OverflowOccurred |= Val.ult(CharVal);
  }
  return OverflowOccurred;
}

/// Convert the literal to a floating-point value stored in \p Result, using
/// rounding mode \p RM.
///
/// The text handed to APFloat is the token spelling up to the suffix, with any
/// digit separators removed. Returns the status produced by the conversion, or
/// APFloat::opInvalidOp if the conversion yielded an error.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
                                    llvm::RoundingMode RM) {
  using llvm::APFloat;

  unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);

  llvm::SmallString<16> Buffer;
  StringRef Str(ThisTokBegin, n);
  // Only copy the spelling when there are separators to strip out.
  if (Str.contains('\'')) {
    Buffer.reserve(n);
    std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
                        &isDigitSeparator);
    Str = Buffer;
  }

  auto StatusOrErr = Result.convertFromString(Str, RM);
  assert(StatusOrErr && "Invalid floating point representation");
  return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
                                               : APFloat::opInvalidOp;
}

/// Return true if \p c introduces an exponent: 'p'/'P' for a hexadecimal
/// literal (\p isHex), 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  if (isHex)
    return c == 'p' || c == 'P';
  return c == 'e' || c == 'E';
}

/// Compute the value of a fixed-point literal scaled by 2^Scale and store it
/// in \p StoreVal, truncated or zero-extended to StoreVal's bit width.
///
/// Returns true if the scaled value does not fit in StoreVal or if the
/// exponent was too large to be handled; returns false otherwise.
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
  assert(radix == 16 || radix == 10);

  // Find how many digits are needed to store the whole literal.
  unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (saw_period) --NumDigits;

  // Initial scan of the exponent if it exists
  bool ExpOverflowOccurred = false;
  bool NegativeExponent = false;
  // Only assigned, and only read, when saw_exponent is set.
  const char *ExponentBegin;
  uint64_t Exponent = 0;
  int64_t BaseShift = 0;
  if (saw_exponent) {
    const char *Ptr = DigitsBegin;

    while (!IsExponentPart(*Ptr, radix == 16))
      ++Ptr;
    ExponentBegin = Ptr;
    ++Ptr;
    NegativeExponent = *Ptr == '-';
    if (NegativeExponent) ++Ptr;

    unsigned NumExpDigits = SuffixBegin - Ptr;
    if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
      llvm::StringRef ExpStr(Ptr, NumExpDigits);
      llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
      Exponent = ExpInt.getZExtValue();
    } else {
      ExpOverflowOccurred = true;
    }

    if (NegativeExponent) BaseShift -= Exponent;
    else BaseShift += Exponent;
  }

  // Number of bits needed for decimal literal is
  //   ceil(NumDigits * log2(10))       Integral part
  // + Scale                            Fractional part
  // + ceil(Exponent * log2(10))        Exponent
  // --------------------------------------------------
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
  //
  // But for simplicity in handling integers, we can round up log2(10) to 4,
  // making:
  // 4 * (NumDigits + Exponent) + Scale
  //
  // Number of bits needed for hexadecimal literal is
  //   4 * NumDigits                    Integral part
  // + Scale                            Fractional part
  // + Exponent                         Exponent
  // --------------------------------------------------
  //   (4 * NumDigits) + Scale + Exponent
  uint64_t NumBitsNeeded;
  if (radix == 10)
    NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
  else
    NumBitsNeeded = 4 * NumDigits + Exponent + Scale;

  // APInt widths are 'unsigned'; a requirement beyond that is reported as an
  // overflow through the return value.
  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
    ExpOverflowOccurred = true;
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);

  bool FoundDecimal = false;

  int64_t FractBaseShift = 0;
  const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
  for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
    if (*Ptr == '.') {
      FoundDecimal = true;
      continue;
    }

    // Normal reading of an integer
    unsigned C = llvm::hexDigitValue(*Ptr);
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

    Val *= radix;
    Val += C;

    if (FoundDecimal)
      // Keep track of how much we will need to adjust this value by from the
      // number of digits past the radix point.
      --FractBaseShift;
  }

  // For a radix of 16, we will be multiplying by 2 instead of 16.
  if (radix == 16) FractBaseShift *= 4;
  BaseShift += FractBaseShift;

  Val <<= Scale;

  // Apply the combined exponent/fraction shift one factor of Base at a time.
  uint64_t Base = (radix == 16) ? 2 : 10;
  if (BaseShift > 0) {
    for (int64_t i = 0; i < BaseShift; ++i) {
      Val *= Base;
    }
  } else if (BaseShift < 0) {
    // Stop early once the value has been divided down to zero.
    for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
      Val = Val.udiv(Base);
  }

  // Fit the result into StoreVal's width, noting whether any bits were lost.
  bool IntOverflowOccurred = false;
  auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
  if (Val.getBitWidth() > StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
    StoreVal = Val.trunc(StoreVal.getBitWidth());
  } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
    StoreVal = Val.zext(StoreVal.getBitWidth());
  } else {
    StoreVal = Val;
  }

  return IntOverflowOccurred || ExpOverflowOccurred;
}
\\ \a \b \f \n \r \t \v1692/// octal-escape-sequence:1693/// \ octal-digit1694/// \ octal-digit octal-digit1695/// \ octal-digit octal-digit octal-digit1696/// hexadecimal-escape-sequence:1697/// \x hexadecimal-digit1698/// hexadecimal-escape-sequence hexadecimal-digit1699/// universal-character-name: [C++11 lex.charset]1700/// \u hex-quad1701/// \U hex-quad hex-quad1702/// hex-quad:1703/// hex-digit hex-digit hex-digit hex-digit1704/// \endverbatim1705///1706CharLiteralParser::CharLiteralParser(const char *begin, const char *end,1707SourceLocation Loc, Preprocessor &PP,1708tok::TokenKind kind) {1709// At this point we know that the character matches the regex "(L|u|U)?'.*'".1710HadError = false;17111712Kind = kind;17131714const char *TokBegin = begin;17151716// Skip over wide character determinant.1717if (Kind != tok::char_constant)1718++begin;1719if (Kind == tok::utf8_char_constant)1720++begin;17211722// Skip over the entry quote.1723if (begin[0] != '\'') {1724PP.Diag(Loc, diag::err_lexing_char);1725HadError = true;1726return;1727}17281729++begin;17301731// Remove an optional ud-suffix.1732if (end[-1] != '\'') {1733const char *UDSuffixEnd = end;1734do {1735--end;1736} while (end[-1] != '\'');1737// FIXME: Don't bother with this if !tok.hasUCN().1738expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));1739UDSuffixOffset = end - TokBegin;1740}17411742// Trim the ending quote.1743assert(end != begin && "Invalid token lexed");1744--end;17451746// FIXME: The "Value" is an uint64_t so we can handle char literals of1747// up to 64-bits.1748// FIXME: This extensively assumes that 'char' is 8-bits.1749assert(PP.getTargetInfo().getCharWidth() == 8 &&1750"Assumes char is 8 bits");1751assert(PP.getTargetInfo().getIntWidth() <= 64 &&1752(PP.getTargetInfo().getIntWidth() & 7) == 0 &&1753"Assumes sizeof(int) on target is <= 64 and a multiple of char");1754assert(PP.getTargetInfo().getWCharWidth() <= 64 &&1755"Assumes sizeof(wchar) on target is <= 
64");17561757SmallVector<uint32_t, 4> codepoint_buffer;1758codepoint_buffer.resize(end - begin);1759uint32_t *buffer_begin = &codepoint_buffer.front();1760uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();17611762// Unicode escapes representing characters that cannot be correctly1763// represented in a single code unit are disallowed in character literals1764// by this implementation.1765uint32_t largest_character_for_kind;1766if (tok::wide_char_constant == Kind) {1767largest_character_for_kind =17680xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());1769} else if (tok::utf8_char_constant == Kind) {1770largest_character_for_kind = 0x7F;1771} else if (tok::utf16_char_constant == Kind) {1772largest_character_for_kind = 0xFFFF;1773} else if (tok::utf32_char_constant == Kind) {1774largest_character_for_kind = 0x10FFFF;1775} else {1776largest_character_for_kind = 0x7Fu;1777}17781779while (begin != end) {1780// Is this a span of non-escape characters?1781if (begin[0] != '\\') {1782char const *start = begin;1783do {1784++begin;1785} while (begin != end && *begin != '\\');17861787char const *tmp_in_start = start;1788uint32_t *tmp_out_start = buffer_begin;1789llvm::ConversionResult res =1790llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),1791reinterpret_cast<llvm::UTF8 const *>(begin),1792&buffer_begin, buffer_end, llvm::strictConversion);1793if (res != llvm::conversionOK) {1794// If we see bad encoding for unprefixed character literals, warn and1795// simply copy the byte values, for compatibility with gcc and1796// older versions of clang.1797bool NoErrorOnBadEncoding = isOrdinary();1798unsigned Msg = diag::err_bad_character_encoding;1799if (NoErrorOnBadEncoding)1800Msg = diag::warn_bad_character_encoding;1801PP.Diag(Loc, Msg);1802if (NoErrorOnBadEncoding) {1803start = tmp_in_start;1804buffer_begin = tmp_out_start;1805for (; start != begin; ++start, ++buffer_begin)1806*buffer_begin = static_cast<uint8_t>(*start);1807} else 
{1808HadError = true;1809}1810} else {1811for (; tmp_out_start < buffer_begin; ++tmp_out_start) {1812if (*tmp_out_start > largest_character_for_kind) {1813HadError = true;1814PP.Diag(Loc, diag::err_character_too_large);1815}1816}1817}18181819continue;1820}1821// Is this a Universal Character Name escape?1822if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {1823unsigned short UcnLen = 0;1824if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,1825FullSourceLoc(Loc, PP.getSourceManager()),1826&PP.getDiagnostics(), PP.getLangOpts(), true)) {1827HadError = true;1828} else if (*buffer_begin > largest_character_for_kind) {1829HadError = true;1830PP.Diag(Loc, diag::err_character_too_large);1831}18321833++buffer_begin;1834continue;1835}1836unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());1837uint64_t result =1838ProcessCharEscape(TokBegin, begin, end, HadError,1839FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,1840&PP.getDiagnostics(), PP.getLangOpts(),1841StringLiteralEvalMethod::Evaluated);1842*buffer_begin++ = result;1843}18441845unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();18461847if (NumCharsSoFar > 1) {1848if (isOrdinary() && NumCharsSoFar == 4)1849PP.Diag(Loc, diag::warn_four_char_character_literal);1850else if (isOrdinary())1851PP.Diag(Loc, diag::warn_multichar_character_literal);1852else {1853PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 
0 : 1);1854HadError = true;1855}1856IsMultiChar = true;1857} else {1858IsMultiChar = false;1859}18601861llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);18621863// Narrow character literals act as though their value is concatenated1864// in this implementation, but warn on overflow.1865bool multi_char_too_long = false;1866if (isOrdinary() && isMultiChar()) {1867LitVal = 0;1868for (size_t i = 0; i < NumCharsSoFar; ++i) {1869// check for enough leading zeros to shift into1870multi_char_too_long |= (LitVal.countl_zero() < 8);1871LitVal <<= 8;1872LitVal = LitVal + (codepoint_buffer[i] & 0xFF);1873}1874} else if (NumCharsSoFar > 0) {1875// otherwise just take the last character1876LitVal = buffer_begin[-1];1877}18781879if (!HadError && multi_char_too_long) {1880PP.Diag(Loc, diag::warn_char_constant_too_large);1881}18821883// Transfer the value from APInt to uint64_t1884Value = LitVal.getZExtValue();18851886// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")1887// if 'char' is signed for this target (C99 6.4.4.4p10). 
Note that multiple1888// character constants are not sign extended in the this implementation:1889// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.1890if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&1891PP.getLangOpts().CharIsSigned)1892Value = (signed char)Value;1893}18941895/// \verbatim1896/// string-literal: [C++0x lex.string]1897/// encoding-prefix " [s-char-sequence] "1898/// encoding-prefix R raw-string1899/// encoding-prefix:1900/// u81901/// u1902/// U1903/// L1904/// s-char-sequence:1905/// s-char1906/// s-char-sequence s-char1907/// s-char:1908/// any member of the source character set except the double-quote ",1909/// backslash \, or new-line character1910/// escape-sequence1911/// universal-character-name1912/// raw-string:1913/// " d-char-sequence ( r-char-sequence ) d-char-sequence "1914/// r-char-sequence:1915/// r-char1916/// r-char-sequence r-char1917/// r-char:1918/// any member of the source character set, except a right parenthesis )1919/// followed by the initial d-char-sequence (which may be empty)1920/// followed by a double quote ".1921/// d-char-sequence:1922/// d-char1923/// d-char-sequence d-char1924/// d-char:1925/// any member of the basic source character set except:1926/// space, the left parenthesis (, the right parenthesis ),1927/// the backslash \, and the control characters representing horizontal1928/// tab, vertical tab, form feed, and newline.1929/// escape-sequence: [C++0x lex.ccon]1930/// simple-escape-sequence1931/// octal-escape-sequence1932/// hexadecimal-escape-sequence1933/// simple-escape-sequence:1934/// one of \' \" \? 
\\ \a \b \f \n \r \t \v1935/// octal-escape-sequence:1936/// \ octal-digit1937/// \ octal-digit octal-digit1938/// \ octal-digit octal-digit octal-digit1939/// hexadecimal-escape-sequence:1940/// \x hexadecimal-digit1941/// hexadecimal-escape-sequence hexadecimal-digit1942/// universal-character-name:1943/// \u hex-quad1944/// \U hex-quad hex-quad1945/// hex-quad:1946/// hex-digit hex-digit hex-digit hex-digit1947/// \endverbatim1948///1949StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,1950Preprocessor &PP,1951StringLiteralEvalMethod EvalMethod)1952: SM(PP.getSourceManager()), Features(PP.getLangOpts()),1953Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),1954MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),1955ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),1956Pascal(false) {1957init(StringToks);1958}19591960void StringLiteralParser::init(ArrayRef<Token> StringToks){1961// The literal token may have come from an invalid source location (e.g. due1962// to a PCH error), in which case the token length will be 0.1963if (StringToks.empty() || StringToks[0].getLength() < 2)1964return DiagnoseLexingError(SourceLocation());19651966// Scan all of the string portions, remember the max individual token length,1967// computing a bound on the concatenated string length, and see whether any1968// piece is a wide-string. If any of the string portions is a wide-string1969// literal, the result is a wide-string literal [C99 6.4.5p4].1970assert(!StringToks.empty() && "expected at least one token");1971MaxTokenLength = StringToks[0].getLength();1972assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");1973SizeBound = StringToks[0].getLength() - 2; // -2 for "".1974hadError = false;19751976// Determines the kind of string from the prefix1977Kind = tok::string_literal;19781979/// (C99 5.1.1.2p1). 
The common case is only one string fragment.1980for (const Token &Tok : StringToks) {1981if (Tok.getLength() < 2)1982return DiagnoseLexingError(Tok.getLocation());19831984// The string could be shorter than this if it needs cleaning, but this is a1985// reasonable bound, which is all we need.1986assert(Tok.getLength() >= 2 && "literal token is invalid!");1987SizeBound += Tok.getLength() - 2; // -2 for "".19881989// Remember maximum string piece length.1990if (Tok.getLength() > MaxTokenLength)1991MaxTokenLength = Tok.getLength();19921993// Remember if we see any wide or utf-8/16/32 strings.1994// Also check for illegal concatenations.1995if (isUnevaluated() && Tok.getKind() != tok::string_literal) {1996if (Diags) {1997SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(1998Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,1999Features);2000CharSourceRange Range =2001CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});2002StringRef Prefix(SM.getCharacterData(Tok.getLocation()),2003getEncodingPrefixLen(Tok.getKind()));2004Diags->Report(Tok.getLocation(),2005Features.CPlusPlus262006? 
diag::err_unevaluated_string_prefix2007: diag::warn_unevaluated_string_prefix)2008<< Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);2009}2010if (Features.CPlusPlus26)2011hadError = true;2012} else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {2013if (isOrdinary()) {2014Kind = Tok.getKind();2015} else {2016if (Diags)2017Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);2018hadError = true;2019}2020}2021}20222023// Include space for the null terminator.2024++SizeBound;20252026// TODO: K&R warning: "traditional C rejects string constant concatenation"20272028// Get the width in bytes of char/wchar_t/char16_t/char32_t2029CharByteWidth = getCharWidth(Kind, Target);2030assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");2031CharByteWidth /= 8;20322033// The output buffer size needs to be large enough to hold wide characters.2034// This is a worst-case assumption which basically corresponds to L"" "long".2035SizeBound *= CharByteWidth;20362037// Size the temporary buffer to hold the result string data.2038ResultBuf.resize(SizeBound);20392040// Likewise, but for each string piece.2041SmallString<512> TokenBuf;2042TokenBuf.resize(MaxTokenLength);20432044// Loop over all the strings, getting their spelling, and expanding them to2045// wide strings as appropriate.2046ResultPtr = &ResultBuf[0]; // Next byte to fill in.20472048Pascal = false;20492050SourceLocation UDSuffixTokLoc;20512052for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {2053const char *ThisTokBuf = &TokenBuf[0];2054// Get the spelling of the token, which eliminates trigraphs, etc. 
We know2055// that ThisTokBuf points to a buffer that is big enough for the whole token2056// and 'spelled' tokens can only shrink.2057bool StringInvalid = false;2058unsigned ThisTokLen =2059Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,2060&StringInvalid);2061if (StringInvalid)2062return DiagnoseLexingError(StringToks[i].getLocation());20632064const char *ThisTokBegin = ThisTokBuf;2065const char *ThisTokEnd = ThisTokBuf+ThisTokLen;20662067// Remove an optional ud-suffix.2068if (ThisTokEnd[-1] != '"') {2069const char *UDSuffixEnd = ThisTokEnd;2070do {2071--ThisTokEnd;2072} while (ThisTokEnd[-1] != '"');20732074StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);20752076if (UDSuffixBuf.empty()) {2077if (StringToks[i].hasUCN())2078expandUCNs(UDSuffixBuf, UDSuffix);2079else2080UDSuffixBuf.assign(UDSuffix);2081UDSuffixToken = i;2082UDSuffixOffset = ThisTokEnd - ThisTokBuf;2083UDSuffixTokLoc = StringToks[i].getLocation();2084} else {2085SmallString<32> ExpandedUDSuffix;2086if (StringToks[i].hasUCN()) {2087expandUCNs(ExpandedUDSuffix, UDSuffix);2088UDSuffix = ExpandedUDSuffix;2089}20902091// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the2092// result of a concatenation involving at least one user-defined-string-2093// literal, all the participating user-defined-string-literals shall2094// have the same ud-suffix.2095bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();2096if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {2097if (Diags) {2098SourceLocation TokLoc = StringToks[i].getLocation();2099if (UnevaluatedStringHasUDL) {2100Diags->Report(TokLoc, diag::err_unevaluated_string_udl)2101<< SourceRange(TokLoc, TokLoc);2102} else {2103Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)2104<< UDSuffixBuf << UDSuffix2105<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);2106}2107}2108hadError = true;2109}2110}2111}21122113// Strip the end quote.2114--ThisTokEnd;21152116// TODO: Input character set mapping 
support.21172118// Skip marker for wide or unicode strings.2119if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {2120++ThisTokBuf;2121// Skip 8 of u8 marker for utf8 strings.2122if (ThisTokBuf[0] == '8')2123++ThisTokBuf;2124}21252126// Check for raw string2127if (ThisTokBuf[0] == 'R') {2128if (ThisTokBuf[1] != '"') {2129// The file may have come from PCH and then changed after loading the2130// PCH; Fail gracefully.2131return DiagnoseLexingError(StringToks[i].getLocation());2132}2133ThisTokBuf += 2; // skip R"21342135// C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 162136// characters.2137constexpr unsigned MaxRawStrDelimLen = 16;21382139const char *Prefix = ThisTokBuf;2140while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&2141ThisTokBuf[0] != '(')2142++ThisTokBuf;2143if (ThisTokBuf[0] != '(')2144return DiagnoseLexingError(StringToks[i].getLocation());2145++ThisTokBuf; // skip '('21462147// Remove same number of characters from the end2148ThisTokEnd -= ThisTokBuf - Prefix;2149if (ThisTokEnd < ThisTokBuf)2150return DiagnoseLexingError(StringToks[i].getLocation());21512152// C++14 [lex.string]p4: A source-file new-line in a raw string literal2153// results in a new-line in the resulting execution string-literal.2154StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);2155while (!RemainingTokenSpan.empty()) {2156// Split the string literal on \r\n boundaries.2157size_t CRLFPos = RemainingTokenSpan.find("\r\n");2158StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);2159StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);21602161// Copy everything before the \r\n sequence into the string literal.2162if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))2163hadError = true;21642165// Point into the \n inside the \r\n sequence and operate on the2166// remaining portion of the literal.2167RemainingTokenSpan = AfterCRLF.substr(1);2168}2169} else {2170if (ThisTokBuf[0] != 
'"') {2171// The file may have come from PCH and then changed after loading the2172// PCH; Fail gracefully.2173return DiagnoseLexingError(StringToks[i].getLocation());2174}2175++ThisTokBuf; // skip "21762177// Check if this is a pascal string2178if (!isUnevaluated() && Features.PascalStrings &&2179ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&2180ThisTokBuf[1] == 'p') {21812182// If the \p sequence is found in the first token, we have a pascal string2183// Otherwise, if we already have a pascal string, ignore the first \p2184if (i == 0) {2185++ThisTokBuf;2186Pascal = true;2187} else if (Pascal)2188ThisTokBuf += 2;2189}21902191while (ThisTokBuf != ThisTokEnd) {2192// Is this a span of non-escape characters?2193if (ThisTokBuf[0] != '\\') {2194const char *InStart = ThisTokBuf;2195do {2196++ThisTokBuf;2197} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');21982199// Copy the character span over.2200if (CopyStringFragment(StringToks[i], ThisTokBegin,2201StringRef(InStart, ThisTokBuf - InStart)))2202hadError = true;2203continue;2204}2205// Is this a Universal Character Name escape?2206if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||2207ThisTokBuf[1] == 'N') {2208EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,2209ResultPtr, hadError,2210FullSourceLoc(StringToks[i].getLocation(), SM),2211CharByteWidth, Diags, Features);2212continue;2213}2214// Otherwise, this is a non-UCN escape character. 
Process it.2215unsigned ResultChar =2216ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,2217FullSourceLoc(StringToks[i].getLocation(), SM),2218CharByteWidth * 8, Diags, Features, EvalMethod);22192220if (CharByteWidth == 4) {2221// FIXME: Make the type of the result buffer correct instead of2222// using reinterpret_cast.2223llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);2224*ResultWidePtr = ResultChar;2225ResultPtr += 4;2226} else if (CharByteWidth == 2) {2227// FIXME: Make the type of the result buffer correct instead of2228// using reinterpret_cast.2229llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);2230*ResultWidePtr = ResultChar & 0xFFFF;2231ResultPtr += 2;2232} else {2233assert(CharByteWidth == 1 && "Unexpected char width");2234*ResultPtr++ = ResultChar & 0xFF;2235}2236}2237}2238}22392240assert((!Pascal || !isUnevaluated()) &&2241"Pascal string in unevaluated context");2242if (Pascal) {2243if (CharByteWidth == 4) {2244// FIXME: Make the type of the result buffer correct instead of2245// using reinterpret_cast.2246llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());2247ResultWidePtr[0] = GetNumStringChars() - 1;2248} else if (CharByteWidth == 2) {2249// FIXME: Make the type of the result buffer correct instead of2250// using reinterpret_cast.2251llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());2252ResultWidePtr[0] = GetNumStringChars() - 1;2253} else {2254assert(CharByteWidth == 1 && "Unexpected char width");2255ResultBuf[0] = GetNumStringChars() - 1;2256}22572258// Verify that pascal strings aren't too large.2259if (GetStringLength() > 256) {2260if (Diags)2261Diags->Report(StringToks.front().getLocation(),2262diag::err_pascal_string_too_long)2263<< SourceRange(StringToks.front().getLocation(),2264StringToks.back().getLocation());2265hadError = true;2266return;2267}2268} else if (Diags) {2269// Complain if this string literal has too many 
characters.2270unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;22712272if (GetNumStringChars() > MaxChars)2273Diags->Report(StringToks.front().getLocation(),2274diag::ext_string_too_long)2275<< GetNumStringChars() << MaxChars2276<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)2277<< SourceRange(StringToks.front().getLocation(),2278StringToks.back().getLocation());2279}2280}22812282static const char *resyncUTF8(const char *Err, const char *End) {2283if (Err == End)2284return End;2285End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);2286while (++Err != End && (*Err & 0xC0) == 0x80)2287;2288return Err;2289}22902291/// This function copies from Fragment, which is a sequence of bytes2292/// within Tok's contents (which begin at TokBegin) into ResultPtr.2293/// Performs widening for multi-byte characters.2294bool StringLiteralParser::CopyStringFragment(const Token &Tok,2295const char *TokBegin,2296StringRef Fragment) {2297const llvm::UTF8 *ErrorPtrTmp;2298if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))2299return false;23002301// If we see bad encoding for unprefixed string literals, warn and2302// simply copy the byte values, for compatibility with gcc and older2303// versions of clang.2304bool NoErrorOnBadEncoding = isOrdinary();2305if (NoErrorOnBadEncoding) {2306memcpy(ResultPtr, Fragment.data(), Fragment.size());2307ResultPtr += Fragment.size();2308}23092310if (Diags) {2311const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);23122313FullSourceLoc SourceLoc(Tok.getLocation(), SM);2314const DiagnosticBuilder &Builder =2315Diag(Diags, Features, SourceLoc, TokBegin,2316ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),2317NoErrorOnBadEncoding ? 
diag::warn_bad_string_encoding2318: diag::err_bad_string_encoding);23192320const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());2321StringRef NextFragment(NextStart, Fragment.end()-NextStart);23222323// Decode into a dummy buffer.2324SmallString<512> Dummy;2325Dummy.reserve(Fragment.size() * CharByteWidth);2326char *Ptr = Dummy.data();23272328while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {2329const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);2330NextStart = resyncUTF8(ErrorPtr, Fragment.end());2331Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,2332ErrorPtr, NextStart);2333NextFragment = StringRef(NextStart, Fragment.end()-NextStart);2334}2335}2336return !NoErrorOnBadEncoding;2337}23382339void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {2340hadError = true;2341if (Diags)2342Diags->Report(Loc, diag::err_lexing_string);2343}23442345/// getOffsetOfStringByte - This function returns the offset of the2346/// specified byte of the string data represented by Token. 
This handles2347/// advancing over escape sequences in the string.2348unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,2349unsigned ByteNo) const {2350// Get the spelling of the token.2351SmallString<32> SpellingBuffer;2352SpellingBuffer.resize(Tok.getLength());23532354bool StringInvalid = false;2355const char *SpellingPtr = &SpellingBuffer[0];2356unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,2357&StringInvalid);2358if (StringInvalid)2359return 0;23602361const char *SpellingStart = SpellingPtr;2362const char *SpellingEnd = SpellingPtr+TokLen;23632364// Handle UTF-8 strings just like narrow strings.2365if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')2366SpellingPtr += 2;23672368assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&2369SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");23702371// For raw string literals, this is easy.2372if (SpellingPtr[0] == 'R') {2373assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");2374// Skip 'R"'.2375SpellingPtr += 2;2376while (*SpellingPtr != '(') {2377++SpellingPtr;2378assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");2379}2380// Skip '('.2381++SpellingPtr;2382return SpellingPtr - SpellingStart + ByteNo;2383}23842385// Skip over the leading quote2386assert(SpellingPtr[0] == '"' && "Should be a string literal!");2387++SpellingPtr;23882389// Skip over bytes until we find the offset we're looking for.2390while (ByteNo) {2391assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");23922393// Step over non-escapes simply.2394if (*SpellingPtr != '\\') {2395++SpellingPtr;2396--ByteNo;2397continue;2398}23992400// Otherwise, this is an escape character. 
Advance over it.2401bool HadError = false;2402if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||2403SpellingPtr[1] == 'N') {2404const char *EscapePtr = SpellingPtr;2405unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,24061, Features, HadError);2407if (Len > ByteNo) {2408// ByteNo is somewhere within the escape sequence.2409SpellingPtr = EscapePtr;2410break;2411}2412ByteNo -= Len;2413} else {2414ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,2415FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,2416Diags, Features, StringLiteralEvalMethod::Evaluated);2417--ByteNo;2418}2419assert(!HadError && "This method isn't valid on erroneous strings");2420}24212422return SpellingPtr-SpellingStart;2423}24242425/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved2426/// suffixes as ud-suffixes, because the diagnostic experience is better if we2427/// treat it as an invalid suffix.2428bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,2429StringRef Suffix) {2430return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||2431Suffix == "sv";2432}243324342435