// Path: blob/main/contrib/llvm-project/clang/lib/Lex/Lexer.cpp
// 35234 views
//===- Lexer.cpp - C Language Family Lexer --------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements the Lexer and Token interfaces.9//10//===----------------------------------------------------------------------===//1112#include "clang/Lex/Lexer.h"13#include "UnicodeCharSets.h"14#include "clang/Basic/CharInfo.h"15#include "clang/Basic/Diagnostic.h"16#include "clang/Basic/IdentifierTable.h"17#include "clang/Basic/LLVM.h"18#include "clang/Basic/LangOptions.h"19#include "clang/Basic/SourceLocation.h"20#include "clang/Basic/SourceManager.h"21#include "clang/Basic/TokenKinds.h"22#include "clang/Lex/LexDiagnostic.h"23#include "clang/Lex/LiteralSupport.h"24#include "clang/Lex/MultipleIncludeOpt.h"25#include "clang/Lex/Preprocessor.h"26#include "clang/Lex/PreprocessorOptions.h"27#include "clang/Lex/Token.h"28#include "llvm/ADT/STLExtras.h"29#include "llvm/ADT/StringExtras.h"30#include "llvm/ADT/StringRef.h"31#include "llvm/ADT/StringSwitch.h"32#include "llvm/Support/Compiler.h"33#include "llvm/Support/ConvertUTF.h"34#include "llvm/Support/MathExtras.h"35#include "llvm/Support/MemoryBufferRef.h"36#include "llvm/Support/NativeFormatting.h"37#include "llvm/Support/Unicode.h"38#include "llvm/Support/UnicodeCharRanges.h"39#include <algorithm>40#include <cassert>41#include <cstddef>42#include <cstdint>43#include <cstring>44#include <optional>45#include <string>46#include <tuple>47#include <utility>4849#ifdef __SSE4_2__50#include <nmmintrin.h>51#endif5253using namespace clang;5455//===----------------------------------------------------------------------===//56// Token Class Implementation57//===----------------------------------------------------------------------===//5859/// 
isObjCAtKeyword - Return true if we have an ObjC keyword identifier.60bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {61if (isAnnotation())62return false;63if (const IdentifierInfo *II = getIdentifierInfo())64return II->getObjCKeywordID() == objcKey;65return false;66}6768/// getObjCKeywordID - Return the ObjC keyword kind.69tok::ObjCKeywordKind Token::getObjCKeywordID() const {70if (isAnnotation())71return tok::objc_not_keyword;72const IdentifierInfo *specId = getIdentifierInfo();73return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;74}7576/// Determine whether the token kind starts a simple-type-specifier.77bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {78switch (getKind()) {79case tok::annot_typename:80case tok::annot_decltype:81case tok::annot_pack_indexing_type:82return true;8384case tok::kw_short:85case tok::kw_long:86case tok::kw___int64:87case tok::kw___int128:88case tok::kw_signed:89case tok::kw_unsigned:90case tok::kw_void:91case tok::kw_char:92case tok::kw_int:93case tok::kw_half:94case tok::kw_float:95case tok::kw_double:96case tok::kw___bf16:97case tok::kw__Float16:98case tok::kw___float128:99case tok::kw___ibm128:100case tok::kw_wchar_t:101case tok::kw_bool:102case tok::kw__Bool:103case tok::kw__Accum:104case tok::kw__Fract:105case tok::kw__Sat:106#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:107#include "clang/Basic/TransformTypeTraits.def"108case tok::kw___auto_type:109case tok::kw_char16_t:110case tok::kw_char32_t:111case tok::kw_typeof:112case tok::kw_decltype:113case tok::kw_char8_t:114return getIdentifierInfo()->isKeyword(LangOpts);115116default:117return false;118}119}120121//===----------------------------------------------------------------------===//122// Lexer Class Implementation123//===----------------------------------------------------------------------===//124125void Lexer::anchor() {}126127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,128const 
char *BufEnd) {129BufferStart = BufStart;130BufferPtr = BufPtr;131BufferEnd = BufEnd;132133assert(BufEnd[0] == 0 &&134"We assume that the input buffer has a null character at the end"135" to simplify lexing!");136137// Check whether we have a BOM in the beginning of the buffer. If yes - act138// accordingly. Right now we support only UTF-8 with and without BOM, so, just139// skip the UTF-8 BOM if it's present.140if (BufferStart == BufferPtr) {141// Determine the size of the BOM.142StringRef Buf(BufferStart, BufferEnd - BufferStart);143size_t BOMLength = llvm::StringSwitch<size_t>(Buf)144.StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM145.Default(0);146147// Skip the BOM.148BufferPtr += BOMLength;149}150151Is_PragmaLexer = false;152CurrentConflictMarkerState = CMK_None;153154// Start of the file is a start of line.155IsAtStartOfLine = true;156IsAtPhysicalStartOfLine = true;157158HasLeadingSpace = false;159HasLeadingEmptyMacro = false;160161// We are not after parsing a #.162ParsingPreprocessorDirective = false;163164// We are not after parsing #include.165ParsingFilename = false;166167// We are not in raw mode. Raw mode disables diagnostics and interpretation168// of tokens (e.g. identifiers, thus disabling macro expansion). It is used169// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block170// or otherwise skipping over tokens.171LexingRawMode = false;172173// Default to not keeping comments.174ExtendedTokenMode = 0;175176NewLinePtr = nullptr;177}178179/// Lexer constructor - Create a new lexer object for the specified buffer180/// with the specified preprocessor managing the lexing process. 
This lexer181/// assumes that the associated file buffer and Preprocessor objects will182/// outlive it, so it doesn't take ownership of either of them.183Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,184Preprocessor &PP, bool IsFirstIncludeOfFile)185: PreprocessorLexer(&PP, FID),186FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),187LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),188IsFirstTimeLexingFile(IsFirstIncludeOfFile) {189InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),190InputFile.getBufferEnd());191192resetExtendedTokenMode();193}194195/// Lexer constructor - Create a new raw lexer object. This object is only196/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text197/// range will outlive it, so it doesn't take ownership of it.198Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,199const char *BufStart, const char *BufPtr, const char *BufEnd,200bool IsFirstIncludeOfFile)201: FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),202IsFirstTimeLexingFile(IsFirstIncludeOfFile) {203InitLexer(BufStart, BufPtr, BufEnd);204205// We *are* in raw mode.206LexingRawMode = true;207}208209/// Lexer constructor - Create a new raw lexer object. This object is only210/// suitable for calls to 'LexFromRawLexer'. 
This lexer assumes that the text211/// range will outlive it, so it doesn't take ownership of it.212Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,213const SourceManager &SM, const LangOptions &langOpts,214bool IsFirstIncludeOfFile)215: Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),216FromFile.getBufferStart(), FromFile.getBufferEnd(),217IsFirstIncludeOfFile) {}218219void Lexer::resetExtendedTokenMode() {220assert(PP && "Cannot reset token mode without a preprocessor");221if (LangOpts.TraditionalCPP)222SetKeepWhitespaceMode(true);223else224SetCommentRetentionState(PP->getCommentRetentionState());225}226227/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for228/// _Pragma expansion. This has a variety of magic semantics that this method229/// sets up. It returns a new'd Lexer that must be delete'd when done.230///231/// On entrance to this routine, TokStartLoc is a macro location which has a232/// spelling loc that indicates the bytes to be lexed for the token and an233/// expansion location that indicates where all lexed tokens should be234/// "expanded from".235///236/// TODO: It would really be nice to make _Pragma just be a wrapper around a237/// normal lexer that remaps tokens as they fly by. This would require making238/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer239/// interface that could handle this stuff. 
This would pull GetMappedTokenLoc240/// out of the critical path of the lexer!241///242Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,243SourceLocation ExpansionLocStart,244SourceLocation ExpansionLocEnd,245unsigned TokLen, Preprocessor &PP) {246SourceManager &SM = PP.getSourceManager();247248// Create the lexer as if we were going to lex the file normally.249FileID SpellingFID = SM.getFileID(SpellingLoc);250llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);251Lexer *L = new Lexer(SpellingFID, InputFile, PP);252253// Now that the lexer is created, change the start/end locations so that we254// just lex the subsection of the file that we want. This is lexing from a255// scratch buffer.256const char *StrData = SM.getCharacterData(SpellingLoc);257258L->BufferPtr = StrData;259L->BufferEnd = StrData+TokLen;260assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");261262// Set the SourceLocation with the remapping information. This ensures that263// GetMappedTokenLoc will remap the tokens as they are lexed.264L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),265ExpansionLocStart,266ExpansionLocEnd, TokLen);267268// Ensure that the lexer thinks it is inside a directive, so that end \n will269// return an EOD token.270L->ParsingPreprocessorDirective = true;271272// This lexer really is for _Pragma.273L->Is_PragmaLexer = true;274return L;275}276277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {278this->IsAtPhysicalStartOfLine = IsAtStartOfLine;279this->IsAtStartOfLine = IsAtStartOfLine;280assert((BufferStart + Offset) <= BufferEnd);281BufferPtr = BufferStart + Offset;282}283284template <typename T> static void StringifyImpl(T &Str, char Quote) {285typename T::size_type i = 0, e = Str.size();286while (i < e) {287if (Str[i] == '\\' || Str[i] == Quote) {288Str.insert(Str.begin() + i, '\\');289i += 2;290++e;291} else if (Str[i] == '\n' || Str[i] == '\r') {292// Replace '\r\n' and '\n\r' to '\\' followed by 
'n'.293if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&294Str[i] != Str[i + 1]) {295Str[i] = '\\';296Str[i + 1] = 'n';297} else {298// Replace '\n' and '\r' to '\\' followed by 'n'.299Str[i] = '\\';300Str.insert(Str.begin() + i + 1, 'n');301++e;302}303i += 2;304} else305++i;306}307}308309std::string Lexer::Stringify(StringRef Str, bool Charify) {310std::string Result = std::string(Str);311char Quote = Charify ? '\'' : '"';312StringifyImpl(Result, Quote);313return Result;314}315316void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }317318//===----------------------------------------------------------------------===//319// Token Spelling320//===----------------------------------------------------------------------===//321322/// Slow case of getSpelling. Extract the characters comprising the323/// spelling of this token from the provided input buffer.324static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,325const LangOptions &LangOpts, char *Spelling) {326assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");327328size_t Length = 0;329const char *BufEnd = BufPtr + Tok.getLength();330331if (tok::isStringLiteral(Tok.getKind())) {332// Munch the encoding-prefix and opening double-quote.333while (BufPtr < BufEnd) {334auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);335Spelling[Length++] = CharAndSize.Char;336BufPtr += CharAndSize.Size;337338if (Spelling[Length - 1] == '"')339break;340}341342// Raw string literals need special handling; trigraph expansion and line343// splicing do not occur within their d-char-sequence nor within their344// r-char-sequence.345if (Length >= 2 &&346Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {347// Search backwards from the end of the token to find the matching closing348// quote.349const char *RawEnd = BufEnd;350do --RawEnd; while (*RawEnd != '"');351size_t RawLength = RawEnd - BufPtr + 1;352353// Everything between the quotes is 
included verbatim in the spelling.354memcpy(Spelling + Length, BufPtr, RawLength);355Length += RawLength;356BufPtr += RawLength;357358// The rest of the token is lexed normally.359}360}361362while (BufPtr < BufEnd) {363auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);364Spelling[Length++] = CharAndSize.Char;365BufPtr += CharAndSize.Size;366}367368assert(Length < Tok.getLength() &&369"NeedsCleaning flag set on token that didn't need cleaning!");370return Length;371}372373/// getSpelling() - Return the 'spelling' of this token. The spelling of a374/// token are the characters used to represent the token in the source file375/// after trigraph expansion and escaped-newline folding. In particular, this376/// wants to get the true, uncanonicalized, spelling of things like digraphs377/// UCNs, etc.378StringRef Lexer::getSpelling(SourceLocation loc,379SmallVectorImpl<char> &buffer,380const SourceManager &SM,381const LangOptions &options,382bool *invalid) {383// Break down the source location.384std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);385386// Try to the load the file buffer.387bool invalidTemp = false;388StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);389if (invalidTemp) {390if (invalid) *invalid = true;391return {};392}393394const char *tokenBegin = file.data() + locInfo.second;395396// Lex from the start of the given location.397Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,398file.begin(), tokenBegin, file.end());399Token token;400lexer.LexFromRawLexer(token);401402unsigned length = token.getLength();403404// Common case: no need for cleaning.405if (!token.needsCleaning())406return StringRef(tokenBegin, length);407408// Hard case, we need to relex the characters into the string.409buffer.resize(length);410buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));411return StringRef(buffer.data(), buffer.size());412}413414/// getSpelling() - Return the 'spelling' of this token. 
The spelling of a415/// token are the characters used to represent the token in the source file416/// after trigraph expansion and escaped-newline folding. In particular, this417/// wants to get the true, uncanonicalized, spelling of things like digraphs418/// UCNs, etc.419std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,420const LangOptions &LangOpts, bool *Invalid) {421assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");422423bool CharDataInvalid = false;424const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),425&CharDataInvalid);426if (Invalid)427*Invalid = CharDataInvalid;428if (CharDataInvalid)429return {};430431// If this token contains nothing interesting, return it directly.432if (!Tok.needsCleaning())433return std::string(TokStart, TokStart + Tok.getLength());434435std::string Result;436Result.resize(Tok.getLength());437Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));438return Result;439}440441/// getSpelling - This method is used to get the spelling of a token into a442/// preallocated buffer, instead of as an std::string. The caller is required443/// to allocate enough space for the token, which is guaranteed to be at least444/// Tok.getLength() bytes long. The actual length of the token is returned.445///446/// Note that this method may do two possible things: it may either fill in447/// the buffer specified with characters, or it may *change the input pointer*448/// to point to a constant buffer with the data already in it (avoiding a449/// copy). 
The caller is not allowed to modify the returned buffer pointer450/// if an internal buffer is returned.451unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,452const SourceManager &SourceMgr,453const LangOptions &LangOpts, bool *Invalid) {454assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");455456const char *TokStart = nullptr;457// NOTE: this has to be checked *before* testing for an IdentifierInfo.458if (Tok.is(tok::raw_identifier))459TokStart = Tok.getRawIdentifier().data();460else if (!Tok.hasUCN()) {461if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {462// Just return the string from the identifier table, which is very quick.463Buffer = II->getNameStart();464return II->getLength();465}466}467468// NOTE: this can be checked even after testing for an IdentifierInfo.469if (Tok.isLiteral())470TokStart = Tok.getLiteralData();471472if (!TokStart) {473// Compute the start of the token in the input lexer buffer.474bool CharDataInvalid = false;475TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);476if (Invalid)477*Invalid = CharDataInvalid;478if (CharDataInvalid) {479Buffer = "";480return 0;481}482}483484// If this token contains nothing interesting, return it directly.485if (!Tok.needsCleaning()) {486Buffer = TokStart;487return Tok.getLength();488}489490// Otherwise, hard case, relex the characters into the string.491return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));492}493494/// MeasureTokenLength - Relex the token at the specified location and return495/// its length in bytes in the input file. 
If the token needs cleaning (e.g.496/// includes a trigraph or an escaped newline) then this count includes bytes497/// that are part of that.498unsigned Lexer::MeasureTokenLength(SourceLocation Loc,499const SourceManager &SM,500const LangOptions &LangOpts) {501Token TheTok;502if (getRawToken(Loc, TheTok, SM, LangOpts))503return 0;504return TheTok.getLength();505}506507/// Relex the token at the specified location.508/// \returns true if there was a failure, false on success.509bool Lexer::getRawToken(SourceLocation Loc, Token &Result,510const SourceManager &SM,511const LangOptions &LangOpts,512bool IgnoreWhiteSpace) {513// TODO: this could be special cased for common tokens like identifiers, ')',514// etc to make this faster, if it mattered. Just look at StrData[0] to handle515// all obviously single-char tokens. This could use516// Lexer::isObviouslySimpleCharacter for example to handle identifiers or517// something.518519// If this comes from a macro expansion, we really do want the macro name, not520// the token this macro expanded to.521Loc = SM.getExpansionLoc(Loc);522std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);523bool Invalid = false;524StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);525if (Invalid)526return true;527528const char *StrData = Buffer.data()+LocInfo.second;529530if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))531return true;532533// Create a lexer starting at the beginning of this token.534Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,535Buffer.begin(), StrData, Buffer.end());536TheLexer.SetCommentRetentionState(true);537TheLexer.LexFromRawLexer(Result);538return false;539}540541/// Returns the pointer that points to the beginning of line that contains542/// the given offset, or null if the offset if invalid.543static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {544const char *BufStart = Buffer.data();545if (Offset >= Buffer.size())546return nullptr;547548const char 
*LexStart = BufStart + Offset;549for (; LexStart != BufStart; --LexStart) {550if (isVerticalWhitespace(LexStart[0]) &&551!Lexer::isNewLineEscaped(BufStart, LexStart)) {552// LexStart should point at first character of logical line.553++LexStart;554break;555}556}557return LexStart;558}559560static SourceLocation getBeginningOfFileToken(SourceLocation Loc,561const SourceManager &SM,562const LangOptions &LangOpts) {563assert(Loc.isFileID());564std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);565if (LocInfo.first.isInvalid())566return Loc;567568bool Invalid = false;569StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);570if (Invalid)571return Loc;572573// Back up from the current location until we hit the beginning of a line574// (or the buffer). We'll relex from that point.575const char *StrData = Buffer.data() + LocInfo.second;576const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);577if (!LexStart || LexStart == StrData)578return Loc;579580// Create a lexer starting at the beginning of this token.581SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);582Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,583Buffer.end());584TheLexer.SetCommentRetentionState(true);585586// Lex tokens until we find the token that contains the source location.587Token TheTok;588do {589TheLexer.LexFromRawLexer(TheTok);590591if (TheLexer.getBufferLocation() > StrData) {592// Lexing this token has taken the lexer past the source location we're593// looking for. If the current token encompasses our source location,594// return the beginning of that token.595if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)596return TheTok.getLocation();597598// We ended up skipping over the source location entirely, which means599// that it points into whitespace. 
We're done here.600break;601}602} while (TheTok.getKind() != tok::eof);603604// We've passed our source location; just return the original source location.605return Loc;606}607608SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,609const SourceManager &SM,610const LangOptions &LangOpts) {611if (Loc.isFileID())612return getBeginningOfFileToken(Loc, SM, LangOpts);613614if (!SM.isMacroArgExpansion(Loc))615return Loc;616617SourceLocation FileLoc = SM.getSpellingLoc(Loc);618SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);619std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);620std::pair<FileID, unsigned> BeginFileLocInfo =621SM.getDecomposedLoc(BeginFileLoc);622assert(FileLocInfo.first == BeginFileLocInfo.first &&623FileLocInfo.second >= BeginFileLocInfo.second);624return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);625}626627namespace {628629enum PreambleDirectiveKind {630PDK_Skipped,631PDK_Unknown632};633634} // namespace635636PreambleBounds Lexer::ComputePreamble(StringRef Buffer,637const LangOptions &LangOpts,638unsigned MaxLines) {639// Create a lexer starting at the beginning of the file. 
Note that we use a640// "fake" file source location at offset 1 so that the lexer will track our641// position within the file.642const SourceLocation::UIntTy StartOffset = 1;643SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);644Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),645Buffer.end());646TheLexer.SetCommentRetentionState(true);647648bool InPreprocessorDirective = false;649Token TheTok;650SourceLocation ActiveCommentLoc;651652unsigned MaxLineOffset = 0;653if (MaxLines) {654const char *CurPtr = Buffer.begin();655unsigned CurLine = 0;656while (CurPtr != Buffer.end()) {657char ch = *CurPtr++;658if (ch == '\n') {659++CurLine;660if (CurLine == MaxLines)661break;662}663}664if (CurPtr != Buffer.end())665MaxLineOffset = CurPtr - Buffer.begin();666}667668do {669TheLexer.LexFromRawLexer(TheTok);670671if (InPreprocessorDirective) {672// If we've hit the end of the file, we're done.673if (TheTok.getKind() == tok::eof) {674break;675}676677// If we haven't hit the end of the preprocessor directive, skip this678// token.679if (!TheTok.isAtStartOfLine())680continue;681682// We've passed the end of the preprocessor directive, and will look683// at this token again below.684InPreprocessorDirective = false;685}686687// Keep track of the # of lines in the preamble.688if (TheTok.isAtStartOfLine()) {689unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;690691// If we were asked to limit the number of lines in the preamble,692// and we're about to exceed that limit, we're done.693if (MaxLineOffset && TokOffset >= MaxLineOffset)694break;695}696697// Comments are okay; skip over them.698if (TheTok.getKind() == tok::comment) {699if (ActiveCommentLoc.isInvalid())700ActiveCommentLoc = TheTok.getLocation();701continue;702}703704if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {705// This is the start of a preprocessor directive.706Token HashTok = TheTok;707InPreprocessorDirective = true;708ActiveCommentLoc = 
SourceLocation();709710// Figure out which directive this is. Since we're lexing raw tokens,711// we don't have an identifier table available. Instead, just look at712// the raw identifier to recognize and categorize preprocessor directives.713TheLexer.LexFromRawLexer(TheTok);714if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {715StringRef Keyword = TheTok.getRawIdentifier();716PreambleDirectiveKind PDK717= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)718.Case("include", PDK_Skipped)719.Case("__include_macros", PDK_Skipped)720.Case("define", PDK_Skipped)721.Case("undef", PDK_Skipped)722.Case("line", PDK_Skipped)723.Case("error", PDK_Skipped)724.Case("pragma", PDK_Skipped)725.Case("import", PDK_Skipped)726.Case("include_next", PDK_Skipped)727.Case("warning", PDK_Skipped)728.Case("ident", PDK_Skipped)729.Case("sccs", PDK_Skipped)730.Case("assert", PDK_Skipped)731.Case("unassert", PDK_Skipped)732.Case("if", PDK_Skipped)733.Case("ifdef", PDK_Skipped)734.Case("ifndef", PDK_Skipped)735.Case("elif", PDK_Skipped)736.Case("elifdef", PDK_Skipped)737.Case("elifndef", PDK_Skipped)738.Case("else", PDK_Skipped)739.Case("endif", PDK_Skipped)740.Default(PDK_Unknown);741742switch (PDK) {743case PDK_Skipped:744continue;745746case PDK_Unknown:747// We don't know what this directive is; stop at the '#'.748break;749}750}751752// We only end up here if we didn't recognize the preprocessor753// directive or it was one that can't occur in the preamble at this754// point. 
Roll back the current token to the location of the '#'.755TheTok = HashTok;756} else if (TheTok.isAtStartOfLine() &&757TheTok.getKind() == tok::raw_identifier &&758TheTok.getRawIdentifier() == "module" &&759LangOpts.CPlusPlusModules) {760// The initial global module fragment introducer "module;" is part of761// the preamble, which runs up to the module declaration "module foo;".762Token ModuleTok = TheTok;763do {764TheLexer.LexFromRawLexer(TheTok);765} while (TheTok.getKind() == tok::comment);766if (TheTok.getKind() != tok::semi) {767// Not global module fragment, roll back.768TheTok = ModuleTok;769break;770}771continue;772}773774// We hit a token that we don't recognize as being in the775// "preprocessing only" part of the file, so we're no longer in776// the preamble.777break;778} while (true);779780SourceLocation End;781if (ActiveCommentLoc.isValid())782End = ActiveCommentLoc; // don't truncate a decl comment.783else784End = TheTok.getLocation();785786return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),787TheTok.isAtStartOfLine());788}789790unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,791const SourceManager &SM,792const LangOptions &LangOpts) {793// Figure out how many physical characters away the specified expansion794// character is. This needs to take into consideration newlines and795// trigraphs.796bool Invalid = false;797const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);798799// If they request the first char of the token, we're trivially done.800if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))801return 0;802803unsigned PhysOffset = 0;804805// The usual case is that tokens don't contain anything interesting. Skip806// over the uninteresting characters. 
If a token only consists of simple807// chars, this method is extremely fast.808while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {809if (CharNo == 0)810return PhysOffset;811++TokPtr;812--CharNo;813++PhysOffset;814}815816// If we have a character that may be a trigraph or escaped newline, use a817// lexer to parse it correctly.818for (; CharNo; --CharNo) {819auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);820TokPtr += CharAndSize.Size;821PhysOffset += CharAndSize.Size;822}823824// Final detail: if we end up on an escaped newline, we want to return the825// location of the actual byte of the token. For example foo\<newline>bar826// advanced by 3 should return the location of b, not of \\. One compounding827// detail of this is that the escape may be made by a trigraph.828if (!Lexer::isObviouslySimpleCharacter(*TokPtr))829PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;830831return PhysOffset;832}833834/// Computes the source location just past the end of the835/// token at this source location.836///837/// This routine can be used to produce a source location that838/// points just past the end of the token referenced by \p Loc, and839/// is generally used when a diagnostic needs to point just after a840/// token where it expected something different that it received. If841/// the returned source location would not be meaningful (e.g., if842/// it points into a macro), this routine returns an invalid843/// source location.844///845/// \param Offset an offset from the end of the token, where the source846/// location should refer to. 
The default offset (0) produces a source847/// location pointing just past the end of the token; an offset of 1 produces848/// a source location pointing to the last character in the token, etc.849SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,850const SourceManager &SM,851const LangOptions &LangOpts) {852if (Loc.isInvalid())853return {};854855if (Loc.isMacroID()) {856if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))857return {}; // Points inside the macro expansion.858}859860unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);861if (Len > Offset)862Len = Len - Offset;863else864return Loc;865866return Loc.getLocWithOffset(Len);867}868869/// Returns true if the given MacroID location points at the first870/// token of the macro expansion.871bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,872const SourceManager &SM,873const LangOptions &LangOpts,874SourceLocation *MacroBegin) {875assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");876877SourceLocation expansionLoc;878if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))879return false;880881if (expansionLoc.isFileID()) {882// No other macro expansions, this is the first.883if (MacroBegin)884*MacroBegin = expansionLoc;885return true;886}887888return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);889}890891/// Returns true if the given MacroID location points at the last892/// token of the macro expansion.893bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,894const SourceManager &SM,895const LangOptions &LangOpts,896SourceLocation *MacroEnd) {897assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");898899SourceLocation spellLoc = SM.getSpellingLoc(loc);900unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);901if (tokLen == 0)902return false;903904SourceLocation afterLoc = loc.getLocWithOffset(tokLen);905SourceLocation expansionLoc;906if 
(!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))907return false;908909if (expansionLoc.isFileID()) {910// No other macro expansions.911if (MacroEnd)912*MacroEnd = expansionLoc;913return true;914}915916return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);917}918919static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,920const SourceManager &SM,921const LangOptions &LangOpts) {922SourceLocation Begin = Range.getBegin();923SourceLocation End = Range.getEnd();924assert(Begin.isFileID() && End.isFileID());925if (Range.isTokenRange()) {926End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);927if (End.isInvalid())928return {};929}930931// Break down the source locations.932FileID FID;933unsigned BeginOffs;934std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);935if (FID.isInvalid())936return {};937938unsigned EndOffs;939if (!SM.isInFileID(End, FID, &EndOffs) ||940BeginOffs > EndOffs)941return {};942943return CharSourceRange::getCharRange(Begin, End);944}945946// Assumes that `Loc` is in an expansion.947static bool isInExpansionTokenRange(const SourceLocation Loc,948const SourceManager &SM) {949return SM.getSLocEntry(SM.getFileID(Loc))950.getExpansion()951.isExpansionTokenRange();952}953954CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,955const SourceManager &SM,956const LangOptions &LangOpts) {957SourceLocation Begin = Range.getBegin();958SourceLocation End = Range.getEnd();959if (Begin.isInvalid() || End.isInvalid())960return {};961962if (Begin.isFileID() && End.isFileID())963return makeRangeFromFileLocs(Range, SM, LangOpts);964965if (Begin.isMacroID() && End.isFileID()) {966if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))967return {};968Range.setBegin(Begin);969return makeRangeFromFileLocs(Range, SM, LangOpts);970}971972if (Begin.isFileID() && End.isMacroID()) {973if (Range.isTokenRange()) {974if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))975return {};976// Use the *original* end, not 
/// Accurately maps a (possibly macro-involved) source range onto a single
/// file-based character range, or returns an invalid range when that is not
/// possible (e.g. the endpoints lie in unrelated macro expansions).
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Easy case: both endpoints are already file locations.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Begin in a macro: only usable if it is the very start of an expansion.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // End in a macro: only usable if it is the very end (token range) or very
  // start (char range) of an expansion.
  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Last resort: both endpoints come from arguments of the *same* macro
  // invocation; retry on their spelling locations.
  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

/// Returns the text covered by Range as a StringRef into the underlying file
/// buffer (no copy). Sets *Invalid (if provided) and returns an empty ref
/// when the range cannot be mapped to a single file range.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}
/// Retrieve the name of the immediate macro expansion containing Loc, as
/// spelled in the source (handles nested macro-argument expansions like
/// "MAC1( MAC2(foo) )").
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Like getImmediateMacroName, but suitable for diagnostics: returns an
/// empty StringRef for token-pastes/stringizations rather than a misleading
/// name.
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Returns true if `c` may appear after the first character of an ASCII
/// identifier ('$' is allowed only under -fdollars-in-identifiers).
bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}

/// Returns true if the vertical-whitespace character at Str is preceded (on
/// the same physical line, possibly after horizontal whitespace) by a
/// backslash, i.e. the newline is a line splice.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  // Treat "\r\n" and "\n\r" as a single newline; step over the other half.
  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

/// Returns the leading whitespace (spaces/tabs) of the physical line
/// containing Loc, or an empty StringRef if Loc is invalid or in a macro.
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}
""1175: Rest.take_front(NumWhitespaceChars);1176}11771178//===----------------------------------------------------------------------===//1179// Diagnostics forwarding code.1180//===----------------------------------------------------------------------===//11811182/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the1183/// lexer buffer was all expanded at a single point, perform the mapping.1184/// This is currently only used for _Pragma implementation, so it is the slow1185/// path of the hot getSourceLocation method. Do not allow it to be inlined.1186static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(1187Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);1188static SourceLocation GetMappedTokenLoc(Preprocessor &PP,1189SourceLocation FileLoc,1190unsigned CharNo, unsigned TokLen) {1191assert(FileLoc.isMacroID() && "Must be a macro expansion");11921193// Otherwise, we're lexing "mapped tokens". This is used for things like1194// _Pragma handling. Combine the expansion location of FileLoc with the1195// spelling location.1196SourceManager &SM = PP.getSourceManager();11971198// Create a new SLoc which is expanded from Expansion(FileLoc) but whose1199// characters come from spelling(FileLoc)+Offset.1200SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);1201SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);12021203// Figure out the expansion loc range, which is the range covered by the1204// original _Pragma(...) 
sequence.1205CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);12061207return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);1208}12091210/// getSourceLocation - Return a source location identifier for the specified1211/// offset in the current file.1212SourceLocation Lexer::getSourceLocation(const char *Loc,1213unsigned TokLen) const {1214assert(Loc >= BufferStart && Loc <= BufferEnd &&1215"Location out of range for this buffer!");12161217// In the normal case, we're just lexing from a simple file buffer, return1218// the file id from FileLoc with the offset specified.1219unsigned CharNo = Loc-BufferStart;1220if (FileLoc.isFileID())1221return FileLoc.getLocWithOffset(CharNo);12221223// Otherwise, this is the _Pragma lexer case, which pretends that all of the1224// tokens are lexed from where the _Pragma was defined.1225assert(PP && "This doesn't work on raw lexers");1226return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);1227}12281229/// Diag - Forwarding function for diagnostics. This translate a source1230/// position in the current buffer into a SourceLocation object for rendering.1231DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {1232return PP->Diag(getSourceLocation(Loc), DiagID);1233}12341235//===----------------------------------------------------------------------===//1236// Trigraph and Escaped Newline Handling Code.1237//===----------------------------------------------------------------------===//12381239/// GetTrigraphCharForLetter - Given a character that occurs after a ?? 
pair,1240/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.1241static char GetTrigraphCharForLetter(char Letter) {1242switch (Letter) {1243default: return 0;1244case '=': return '#';1245case ')': return ']';1246case '(': return '[';1247case '!': return '|';1248case '\'': return '^';1249case '>': return '}';1250case '/': return '\\';1251case '<': return '{';1252case '-': return '~';1253}1254}12551256/// DecodeTrigraphChar - If the specified character is a legal trigraph when1257/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,1258/// return the result character. Finally, emit a warning about trigraph use1259/// whether trigraphs are enabled or not.1260static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {1261char Res = GetTrigraphCharForLetter(*CP);1262if (!Res)1263return Res;12641265if (!Trigraphs) {1266if (L && !L->isLexingRawMode())1267L->Diag(CP-2, diag::trigraph_ignored);1268return 0;1269}12701271if (L && !L->isLexingRawMode())1272L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);1273return Res;1274}12751276/// getEscapedNewLineSize - Return the size of the specified escaped newline,1277/// or 0 if it is not an escaped newline. 
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  // Scan forward over horizontal whitespace until a newline (or a
  // non-whitespace character, which means this is not a splice at all).
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}
/// Finds the token that comes after the token at the given location, raw-lexed
/// from the underlying file buffer. Returns std::nullopt if the location is
/// inside a macro (but not at its end) or the buffer cannot be loaded.
std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    // Only usable when Loc is the last token of its macro expansion.
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph-encoded backslash may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd; // Clamp to the end of the buffer.
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

/// Returns true if the codepoint is a Unicode whitespace character.
static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}

/// Renders a codepoint as an uppercase hexadecimal string (at least 4 digits)
/// for use in diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}
// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends the XID_Start/XID_continue
// with mathematical symbols, superscripts and subscripts digits
// found in some production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);
  // Continue-only characters are valid only in non-start position.
  if (MathStartChars.contains(C) ||
      (!IsStart && MathContinueChars.contains(C))) {
    IsExtension = true;
    return true;
  }
  return false;
}

/// Returns true if codepoint C may appear in a non-leading identifier
/// position under the active language mode; sets IsExtension when the match
/// comes from the mathematical-notation extension profile.
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
                            bool &IsExtension) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.DollarIdents && '$' == C) {
    return true;
  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A non-leading codepoint must have the XID_Continue property.
    // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
    // so we need to check both tables.
    // '_' doesn't have the XID_Continue property but is allowed in C and C++.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
                                     IsExtension);
  } else if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

/// Returns true if non-ASCII codepoint C may begin an identifier under the
/// active language mode; sets IsExtension as in isAllowedIDChar.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
                                     bool &IsExtension) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  IsExtension = false;
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  if (LangOpts.CPlusPlus || LangOpts.C23) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    if (XIDStartChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
                                     IsExtension);
  }
  // C99/C11: allowed as a continue character AND not in the
  // disallowed-initial set.
  if (!isAllowedIDChar(C, LangOpts, IsExtension))
    return false;
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}

/// Emits the "mathematical notation" extension diagnostic for codepoint C at
/// Range. C must be in one of the mathematical-notation profile tables.
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
                                          CharSourceRange Range) {

  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  (void)MathStartChars;
  (void)MathContinueChars;
  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
         "Unexpected mathematical notation codepoint");
  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
      << codepointAsHexString(C) << Range;
}

/// Builds a CharSourceRange for the half-open byte range [Begin, End) within
/// the lexer's buffer.
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}
/// Emits a -Wc99-compat warning if identifier codepoint C would not be valid
/// (or could not start an identifier) under C99 rules.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by codepoint; terminated by a {0, 0} sentinel excluded from the
  // binary search below.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      // LooksLike == 0 marks an invisible/zero-width character.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}
/// Reports an error (with a removal fix-it) for a non-ASCII codepoint that is
/// not valid in an identifier (or cannot start one) in the current language
/// mode.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // Valid at this position: nothing to diagnose.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  // Valid as a continue character but not as the first one.
  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

/// If CurPtr + Size points at a UCN (\uXXXX / \UXXXXXXXX) that continues the
/// identifier being lexed, consume it, flag the token as containing a UCN,
/// advance CurPtr past it, and return true. Returns false otherwise.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a
    // valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // Fast path: advance directly when the UCN had the canonical spelling
  // (no trigraphs or escaped newlines inside it); otherwise re-lex char by
  // char so the size bookkeeping stays correct.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
/// Lex a token whose first character is the non-ASCII codepoint \p C
/// (already decoded), deciding whether it starts an identifier.
///
/// \param Result the token being formed.
/// \param C the decoded codepoint at the start of the token.
/// \param CurPtr points just past the codepoint's spelling.
/// \return true if a token was formed; false if the character was dropped
///         (caller should retry lexing at the updated BufferPtr).
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}
The standard requires that1885// we not throw away any possible preprocessor tokens, but there's a1886// loophole in the mapping of Unicode characters to basic character set1887// characters that allows us to map these particular characters to, say,1888// whitespace.1889diagnoseInvalidUnicodeCodepointInIdentifier(1890PP->getDiagnostics(), LangOpts, C,1891makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);1892BufferPtr = CurPtr;1893return false;1894}18951896// Otherwise, we have an explicit UCN or a character that's unlikely to show1897// up by accident.1898MIOpt.ReadToken();1899FormTokenWithChars(Result, CurPtr, tok::unknown);1900return true;1901}19021903static const char *1904fastParseASCIIIdentifier(const char *CurPtr,1905[[maybe_unused]] const char *BufferEnd) {1906#ifdef __SSE4_2__1907alignas(16) static constexpr char AsciiIdentifierRange[16] = {1908'_', '_', 'A', 'Z', 'a', 'z', '0', '9',1909};1910constexpr ssize_t BytesPerRegister = 16;19111912__m128i AsciiIdentifierRangeV =1913_mm_load_si128((const __m128i *)AsciiIdentifierRange);19141915while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {1916__m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));19171918int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,1919_SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |1920_SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);1921CurPtr += Consumed;1922if (Consumed == BytesPerRegister)1923continue;1924return CurPtr;1925}1926#endif19271928unsigned char C = *CurPtr;1929while (isAsciiIdentifierContinue(C))1930C = *++CurPtr;1931return CurPtr;1932}19331934bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {1935// Match [_A-Za-z0-9]*, we have already matched an identifier start.19361937while (true) {19381939CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);19401941unsigned Size;1942// Slow path: handle trigraph, unicode codepoints, UCNs.1943unsigned char C = getCharAndSize(CurPtr, Size);1944if (isAsciiIdentifierContinue(C)) {1945CurPtr = 
/// Lex the remainder of an identifier whose start has already been matched,
/// forming a raw_identifier token and (outside raw mode) resolving it via
/// the preprocessor's identifier table.
///
/// \param Result the token being formed.
/// \param CurPtr points just past the already-consumed identifier start.
/// \return whatever the preprocessor's HandleIdentifier decides, or true.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      // An identifier char spelled via trigraph/escaped newline.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}
[a-zA-Z0-9_]) are supported to keep the code1991// simpler.1992assert(*CurPtr == 0 && "Completion character must be 0");1993++CurPtr;1994// Note that code completion token is not added as a separate character1995// when the completion point is at the end of the buffer. Therefore, we need1996// to check if the buffer has ended.1997if (CurPtr < BufferEnd) {1998while (isAsciiIdentifierContinue(*CurPtr))1999++CurPtr;2000}2001BufferPtr = CurPtr;2002return true;2003}20042005// Finally, now that we know we have an identifier, pass this off to the2006// preprocessor, which may macro expand it or something.2007if (II->isHandleIdentifierCase())2008return PP->HandleIdentifier(Result);20092010return true;2011}20122013/// isHexaLiteral - Return true if Start points to a hex constant.2014/// in microsoft mode (where this is supposed to be several different tokens).2015bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {2016auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);2017char C1 = CharAndSize1.Char;2018if (C1 != '0')2019return false;20202021auto CharAndSize2 =2022Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);2023char C2 = CharAndSize2.Char;2024return (C2 == 'x' || C2 == 'X');2025}20262027/// LexNumericConstant - Lex the remainder of a integer or floating point2028/// constant. From[-1] is the first character lexed. Return the end of the2029/// constant.2030bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {2031unsigned Size;2032char C = getCharAndSize(CurPtr, Size);2033char PrevCh = 0;2034while (isPreprocessingNumberBody(C)) {2035CurPtr = ConsumeChar(CurPtr, Size, Result);2036PrevCh = C;2037if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {2038CurPtr -= Size;2039break;2040}2041C = getCharAndSize(CurPtr, Size);2042}20432044// If we fell out, check for a sign, due to 1e+12. 
/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    // HLSL: stop before a '.x'/'.r' swizzle so it isn't eaten as pp-number.
    if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
      CurPtr -= Size;
      break;
    }
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c23_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal
/// suffixes in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result the literal token; gets the HasUDSuffix flag if a suffix
///        is consumed.
/// \param CurPtr points just past the closing quote of the literal.
/// \param IsStringLiteral true for string literals (enables the C++14
///        standard-suffix lookahead, e.g. operator""if).
/// \return pointer past the consumed suffix (or CurPtr unchanged).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!LangOpts.CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix: identifier chars, UCNs, or UTF-8.
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}
/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L" or u8" or u" or U".
///
/// \param Result the token being formed (kind \p Kind, or tok::unknown on an
///        unterminated literal).
/// \param CurPtr points just past the opening quote.
/// \param Kind the string literal token kind to form.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // Embedded NUL: either a code-completion point or a literal NUL byte.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
///
/// \param Result the token being formed (kind \p Kind, or tok::unknown on a
///        bad delimiter or unterminated literal).
/// \param CurPtr points just past the opening quote, at the delimiter.
/// \param Kind the string literal token kind to form.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the d-char-sequence; the standard caps it at 16 characters.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
    if (!isLexingRawMode() &&
        llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
      const char *Pos = &CurPtr[PrefixLen];
      Diag(Pos, LangOpts.CPlusPlus26
                    ? diag::warn_cxx26_compat_raw_string_literal_character_set
                    : diag::ext_cxx26_raw_string_literal_character_set)
          << StringRef(Pos, 1);
    }
    ++PrefixLen;
  }

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else if (*PrefixEnd == '\n') {
        Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
            << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
            << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
///
/// \param Result the token being formed (tok::header_name, or tok::less when
///        the literal is unterminated and the '<' must stand alone).
/// \param CurPtr points just past the '<'.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}
/// Set up the preprocessor's code-completion state for completing an
/// #include filename.
///
/// \param PathStart start of the partial path spelled so far (just past the
///        opening '"' or '<').
/// \param CompletionPoint the position of the completion marker.
/// \param IsAngled true for <...> includes, false for "..." includes.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result the token being formed (kind \p Kind, or tok::unknown for
///        an empty or unterminated constant).
/// \param CurPtr points just past the opening quote.
/// \param Kind the character constant token kind to form.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is
/// enabled; otherwise it returns false and the caller keeps lexing.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen, and record the first one in NewLinePtr so
  // that empty-line ranges can be reported to the EmptylineHandler below.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // More than one newline was skipped: notify the empty-line handler.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}
/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Validate (but otherwise skip) multi-byte UTF-8 inside the comment.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// \param Result the comment token being formed.
/// \param CurPtr points at the end of the comment text.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \\n or \\r) is part of an escaped newline
/// sequence that ends a block comment (i.e. a '*' followed by an escaped
/// newline followed by '/'). Issue a diagnostic if so. We know that the
/// newline is inside of a block comment.
///
/// \param CurPtr points at the newline character.
/// \param L the lexer, used for diagnostics.
/// \param Trigraphs whether trigraphs are enabled in this language mode.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards over (possibly multiple) escaped newlines until we reach
  // the '*' that would terminate the comment, or prove there isn't one.
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}
&& CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: scan 16 bytes at a time for non-ASCII or '/'.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.
/// This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
      MIOpt.ExitTopLevelConditional();
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
        << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  if (isDependencyDirectivesLexer()) {
    if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
      return 2;
    return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
        tok::l_paren);
  }

  // Switch to 'skipping' mode.
// This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  // Perforce markers end with "<<<<\n"; normal (git/svn) markers with ">>>>>>>".
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely.  This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

// Scan forward for the "#>" that closes an editor placeholder; returns the
// pointer just past it, or nullptr if no terminator exists in the buffer.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start,
diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \uXXXX takes 4 hex digits; \UXXXXXXXX takes 8.  (Always set: the assert
  // above guarantees Kind is one of 'u' or 'U'.)
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Accepting another digit would shift bits out of the 32-bit accumulator.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}

std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  // Collect the character name up to the closing '}' (or stop at a newline).
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (isVerticalWhitespace(C))
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ?
diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid.  This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {

  unsigned CharSize;
  std::optional<uint32_t> CodePointOpt;
  char Kind = getCharAndSize(StartPtr, CharSize);
  if (Kind == 'u' || Kind == 'U')
    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
  else if (Kind == 'N')
    CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

  if (!CodePointOpt)
    return 0;

  uint32_t CodePoint = *CodePointOpt;

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C23 6.4.3p2: A universal character name shall not designate a code point
  // where the hexadecimal value is:
  // - in the range D800 through DFFF inclusive; or
  // - greater than 10FFFF.
  // A universal-character-name outside the c-char-sequence of a character
  // constant, or the s-char-sequence of a string-literal shall not designate
  // a control character or a character in the basic character set.

  // C++11 [lex.charset]p2: If the hexadecimal value for a
  // universal-character-name corresponds to a surrogate code point (in the
  // range 0xD800-0xDFFF, inclusive), the program is ill-formed.  Additionally,
  // if the hexadecimal value for a universal-character-name outside the
  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  // string literal corresponds to a control character (in either of the
  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  // basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
    // We don't use isLexingRawMode() here because we need to diagnose bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}

bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      isUnicodeWhitespace(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
        << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect
// IsAtPhysicalStartOfLine.
}

bool Lexer::Lex(Token &Result) {
  assert(!isDependencyDirectivesLexer());

  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}

/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexStart:
  assert(!Result.needsCleaning() && "Result needs cleaning");
  assert(!Result.hasPtrData() && "Result has not been reset");

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if (isHorizontalWhitespace(*CurPtr)) {
    do {
      ++CurPtr;
    } while (isHorizontalWhitespace(*CurPtr));

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.RawStringLiterals) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp,
Result),3907SizeTmp2, Result),3908SizeTmp3, Result),3909tok::utf8_string_literal);3910}3911}3912}3913}39143915// treat u like the start of an identifier.3916return LexIdentifierContinue(Result, CurPtr);39173918case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal3919// Notify MIOpt that we read a non-whitespace/non-comment token.3920MIOpt.ReadToken();39213922if (LangOpts.CPlusPlus11 || LangOpts.C11) {3923Char = getCharAndSize(CurPtr, SizeTmp);39243925// UTF-32 string literal3926if (Char == '"')3927return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),3928tok::utf32_string_literal);39293930// UTF-32 character constant3931if (Char == '\'')3932return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),3933tok::utf32_char_constant);39343935// UTF-32 raw string literal3936if (Char == 'R' && LangOpts.RawStringLiterals &&3937getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')3938return LexRawStringLiteral(Result,3939ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),3940SizeTmp2, Result),3941tok::utf32_string_literal);3942}39433944// treat U like the start of an identifier.3945return LexIdentifierContinue(Result, CurPtr);39463947case 'R': // Identifier or C++0x raw string literal3948// Notify MIOpt that we read a non-whitespace/non-comment token.3949MIOpt.ReadToken();39503951if (LangOpts.RawStringLiterals) {3952Char = getCharAndSize(CurPtr, SizeTmp);39533954if (Char == '"')3955return LexRawStringLiteral(Result,3956ConsumeChar(CurPtr, SizeTmp, Result),3957tok::string_literal);3958}39593960// treat R like the start of an identifier.3961return LexIdentifierContinue(Result, CurPtr);39623963case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").3964// Notify MIOpt that we read a non-whitespace/non-comment token.3965MIOpt.ReadToken();3966Char = getCharAndSize(CurPtr, SizeTmp);39673968// Wide string literal.3969if (Char == '"')3970return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, 
Result),3971tok::wide_string_literal);39723973// Wide raw string literal.3974if (LangOpts.RawStringLiterals && Char == 'R' &&3975getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')3976return LexRawStringLiteral(Result,3977ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),3978SizeTmp2, Result),3979tok::wide_string_literal);39803981// Wide character constant.3982if (Char == '\'')3983return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),3984tok::wide_char_constant);3985// FALL THROUGH, treating L like the start of an identifier.3986[[fallthrough]];39873988// C99 6.4.2: Identifiers.3989case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':3990case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':3991case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/3992case 'V': case 'W': case 'X': case 'Y': case 'Z':3993case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':3994case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':3995case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/3996case 'v': case 'w': case 'x': case 'y': case 'z':3997case '_':3998// Notify MIOpt that we read a non-whitespace/non-comment token.3999MIOpt.ReadToken();4000return LexIdentifierContinue(Result, CurPtr);40014002case '$': // $ in identifiers.4003if (LangOpts.DollarIdents) {4004if (!isLexingRawMode())4005Diag(CurPtr-1, diag::ext_dollar_in_identifier);4006// Notify MIOpt that we read a non-whitespace/non-comment token.4007MIOpt.ReadToken();4008return LexIdentifierContinue(Result, CurPtr);4009}40104011Kind = tok::unknown;4012break;40134014// C99 6.4.4: Character Constants.4015case '\'':4016// Notify MIOpt that we read a non-whitespace/non-comment token.4017MIOpt.ReadToken();4018return LexCharConstant(Result, CurPtr, tok::char_constant);40194020// C99 6.4.5: String Literals.4021case '"':4022// Notify MIOpt that we read a non-whitespace/non-comment token.4023MIOpt.ReadToken();4024return LexStringLiteral(Result, 
CurPtr,4025ParsingFilename ? tok::header_name4026: tok::string_literal);40274028// C99 6.4.6: Punctuators.4029case '?':4030Kind = tok::question;4031break;4032case '[':4033Kind = tok::l_square;4034break;4035case ']':4036Kind = tok::r_square;4037break;4038case '(':4039Kind = tok::l_paren;4040break;4041case ')':4042Kind = tok::r_paren;4043break;4044case '{':4045Kind = tok::l_brace;4046break;4047case '}':4048Kind = tok::r_brace;4049break;4050case '.':4051Char = getCharAndSize(CurPtr, SizeTmp);4052if (Char >= '0' && Char <= '9') {4053// Notify MIOpt that we read a non-whitespace/non-comment token.4054MIOpt.ReadToken();40554056return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));4057} else if (LangOpts.CPlusPlus && Char == '*') {4058Kind = tok::periodstar;4059CurPtr += SizeTmp;4060} else if (Char == '.' &&4061getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {4062Kind = tok::ellipsis;4063CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4064SizeTmp2, Result);4065} else {4066Kind = tok::period;4067}4068break;4069case '&':4070Char = getCharAndSize(CurPtr, SizeTmp);4071if (Char == '&') {4072Kind = tok::ampamp;4073CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4074} else if (Char == '=') {4075Kind = tok::ampequal;4076CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4077} else {4078Kind = tok::amp;4079}4080break;4081case '*':4082if (getCharAndSize(CurPtr, SizeTmp) == '=') {4083Kind = tok::starequal;4084CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4085} else {4086Kind = tok::star;4087}4088break;4089case '+':4090Char = getCharAndSize(CurPtr, SizeTmp);4091if (Char == '+') {4092CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4093Kind = tok::plusplus;4094} else if (Char == '=') {4095CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4096Kind = tok::plusequal;4097} else {4098Kind = tok::plus;4099}4100break;4101case '-':4102Char = getCharAndSize(CurPtr, SizeTmp);4103if (Char == '-') { // --4104CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4105Kind = 
tok::minusminus;4106} else if (Char == '>' && LangOpts.CPlusPlus &&4107getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*4108CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4109SizeTmp2, Result);4110Kind = tok::arrowstar;4111} else if (Char == '>') { // ->4112CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4113Kind = tok::arrow;4114} else if (Char == '=') { // -=4115CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4116Kind = tok::minusequal;4117} else {4118Kind = tok::minus;4119}4120break;4121case '~':4122Kind = tok::tilde;4123break;4124case '!':4125if (getCharAndSize(CurPtr, SizeTmp) == '=') {4126Kind = tok::exclaimequal;4127CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4128} else {4129Kind = tok::exclaim;4130}4131break;4132case '/':4133// 6.4.9: Comments4134Char = getCharAndSize(CurPtr, SizeTmp);4135if (Char == '/') { // Line comment.4136// Even if Line comments are disabled (e.g. in C89 mode), we generally4137// want to lex this as a comment. There is one problem with this though,4138// that in one particular corner case, this can change the behavior of the4139// resultant program. For example, In "foo //**/ bar", C89 would lex4140// this as "foo / bar" and languages with Line comments would lex it as4141// "foo". Check to see if the character after the second slash is a '*'.4142// If so, we will lex that as a "/" instead of the start of a comment.4143// However, we never do this if we are just preprocessing.4144bool TreatAsComment =4145LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);4146if (!TreatAsComment)4147if (!(PP && PP->isPreprocessedOutput()))4148TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';41494150if (TreatAsComment) {4151if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),4152TokAtPhysicalStartOfLine))4153return true; // There is a token to return.41544155// It is common for the tokens immediately after a // comment to be4156// whitespace (indentation for the next line). 
Instead of going through4157// the big switch, handle it efficiently now.4158goto SkipIgnoredUnits;4159}4160}41614162if (Char == '*') { // /**/ comment.4163if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),4164TokAtPhysicalStartOfLine))4165return true; // There is a token to return.41664167// We only saw whitespace, so just try again with this lexer.4168// (We manually eliminate the tail call to avoid recursion.)4169goto LexNextToken;4170}41714172if (Char == '=') {4173CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4174Kind = tok::slashequal;4175} else {4176Kind = tok::slash;4177}4178break;4179case '%':4180Char = getCharAndSize(CurPtr, SizeTmp);4181if (Char == '=') {4182Kind = tok::percentequal;4183CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4184} else if (LangOpts.Digraphs && Char == '>') {4185Kind = tok::r_brace; // '%>' -> '}'4186CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4187} else if (LangOpts.Digraphs && Char == ':') {4188CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4189Char = getCharAndSize(CurPtr, SizeTmp);4190if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {4191Kind = tok::hashhash; // '%:%:' -> '##'4192CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4193SizeTmp2, Result);4194} else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize4195CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4196if (!isLexingRawMode())4197Diag(BufferPtr, diag::ext_charize_microsoft);4198Kind = tok::hashat;4199} else { // '%:' -> '#'4200// We parsed a # character. If this occurs at the start of the line,4201// it's actually the start of a preprocessing directive. 
Callback to4202// the preprocessor to handle it.4203// TODO: -fpreprocessed mode??4204if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)4205goto HandleDirective;42064207Kind = tok::hash;4208}4209} else {4210Kind = tok::percent;4211}4212break;4213case '<':4214Char = getCharAndSize(CurPtr, SizeTmp);4215if (ParsingFilename) {4216return LexAngledStringLiteral(Result, CurPtr);4217} else if (Char == '<') {4218char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);4219if (After == '=') {4220Kind = tok::lesslessequal;4221CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4222SizeTmp2, Result);4223} else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {4224// If this is actually a '<<<<<<<' version control conflict marker,4225// recognize it as such and recover nicely.4226goto LexNextToken;4227} else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {4228// If this is '<<<<' and we're in a Perforce-style conflict marker,4229// ignore it.4230goto LexNextToken;4231} else if (LangOpts.CUDA && After == '<') {4232Kind = tok::lesslessless;4233CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4234SizeTmp2, Result);4235} else {4236CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4237Kind = tok::lessless;4238}4239} else if (Char == '=') {4240char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);4241if (After == '>') {4242if (LangOpts.CPlusPlus20) {4243if (!isLexingRawMode())4244Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);4245CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4246SizeTmp2, Result);4247Kind = tok::spaceship;4248break;4249}4250// Suggest adding a space between the '<=' and the '>' to avoid a4251// change in semantics if this turns up in C++ <=17 mode.4252if (LangOpts.CPlusPlus && !isLexingRawMode()) {4253Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)4254<< FixItHint::CreateInsertion(4255getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");4256}4257}4258CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4259Kind = 
tok::lessequal;4260} else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['4261if (LangOpts.CPlusPlus11 &&4262getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {4263// C++0x [lex.pptoken]p3:4264// Otherwise, if the next three characters are <:: and the subsequent4265// character is neither : nor >, the < is treated as a preprocessor4266// token by itself and not as the first character of the alternative4267// token <:.4268unsigned SizeTmp3;4269char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);4270if (After != ':' && After != '>') {4271Kind = tok::less;4272if (!isLexingRawMode())4273Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);4274break;4275}4276}42774278CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4279Kind = tok::l_square;4280} else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'4281CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4282Kind = tok::l_brace;4283} else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&4284lexEditorPlaceholder(Result, CurPtr)) {4285return true;4286} else {4287Kind = tok::less;4288}4289break;4290case '>':4291Char = getCharAndSize(CurPtr, SizeTmp);4292if (Char == '=') {4293CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4294Kind = tok::greaterequal;4295} else if (Char == '>') {4296char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);4297if (After == '=') {4298CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4299SizeTmp2, Result);4300Kind = tok::greatergreaterequal;4301} else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {4302// If this is actually a '>>>>' conflict marker, recognize it as such4303// and recover nicely.4304goto LexNextToken;4305} else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {4306// If this is '>>>>>>>' and we're in a conflict marker, ignore it.4307goto LexNextToken;4308} else if (LangOpts.CUDA && After == '>') {4309Kind = tok::greatergreatergreater;4310CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),4311SizeTmp2, Result);4312} else 
{4313CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4314Kind = tok::greatergreater;4315}4316} else {4317Kind = tok::greater;4318}4319break;4320case '^':4321Char = getCharAndSize(CurPtr, SizeTmp);4322if (Char == '=') {4323CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4324Kind = tok::caretequal;4325} else if (LangOpts.OpenCL && Char == '^') {4326CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4327Kind = tok::caretcaret;4328} else {4329Kind = tok::caret;4330}4331break;4332case '|':4333Char = getCharAndSize(CurPtr, SizeTmp);4334if (Char == '=') {4335Kind = tok::pipeequal;4336CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4337} else if (Char == '|') {4338// If this is '|||||||' and we're in a conflict marker, ignore it.4339if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))4340goto LexNextToken;4341Kind = tok::pipepipe;4342CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4343} else {4344Kind = tok::pipe;4345}4346break;4347case ':':4348Char = getCharAndSize(CurPtr, SizeTmp);4349if (LangOpts.Digraphs && Char == '>') {4350Kind = tok::r_square; // ':>' -> ']'4351CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4352} else if (Char == ':') {4353Kind = tok::coloncolon;4354CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4355} else {4356Kind = tok::colon;4357}4358break;4359case ';':4360Kind = tok::semi;4361break;4362case '=':4363Char = getCharAndSize(CurPtr, SizeTmp);4364if (Char == '=') {4365// If this is '====' and we're in a conflict marker, ignore it.4366if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))4367goto LexNextToken;43684369Kind = tok::equalequal;4370CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4371} else {4372Kind = tok::equal;4373}4374break;4375case ',':4376Kind = tok::comma;4377break;4378case '#':4379Char = getCharAndSize(CurPtr, SizeTmp);4380if (Char == '#') {4381Kind = tok::hashhash;4382CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4383} else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize4384Kind = tok::hashat;4385if 
(!isLexingRawMode())4386Diag(BufferPtr, diag::ext_charize_microsoft);4387CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);4388} else {4389// We parsed a # character. If this occurs at the start of the line,4390// it's actually the start of a preprocessing directive. Callback to4391// the preprocessor to handle it.4392// TODO: -fpreprocessed mode??4393if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)4394goto HandleDirective;43954396Kind = tok::hash;4397}4398break;43994400case '@':4401// Objective C support.4402if (CurPtr[-1] == '@' && LangOpts.ObjC)4403Kind = tok::at;4404else4405Kind = tok::unknown;4406break;44074408// UCNs (C99 6.4.3, C++11 [lex.charset]p2)4409case '\\':4410if (!LangOpts.AsmPreprocessor) {4411if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {4412if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {4413if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))4414return true; // KeepWhitespaceMode44154416// We only saw whitespace, so just try again with this lexer.4417// (We manually eliminate the tail call to avoid recursion.)4418goto LexNextToken;4419}44204421return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);4422}4423}44244425Kind = tok::unknown;4426break;44274428default: {4429if (isASCII(Char)) {4430Kind = tok::unknown;4431break;4432}44334434llvm::UTF32 CodePoint;44354436// We can't just reset CurPtr to BufferPtr because BufferPtr may point to4437// an escaped newline.4438--CurPtr;4439llvm::ConversionResult Status =4440llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,4441(const llvm::UTF8 *)BufferEnd,4442&CodePoint,4443llvm::strictConversion);4444if (Status == llvm::conversionOK) {4445if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {4446if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))4447return true; // KeepWhitespaceMode44484449// We only saw whitespace, so just try again with this lexer.4450// (We manually eliminate the tail call to avoid recursion.)4451goto 
LexNextToken;4452}4453return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);4454}44554456if (isLexingRawMode() || ParsingPreprocessorDirective ||4457PP->isPreprocessedOutput()) {4458++CurPtr;4459Kind = tok::unknown;4460break;4461}44624463// Non-ASCII characters tend to creep into source code unintentionally.4464// Instead of letting the parser complain about the unknown token,4465// just diagnose the invalid UTF-8, then drop the character.4466Diag(CurPtr, diag::err_invalid_utf8);44674468BufferPtr = CurPtr+1;4469// We're pretending the character didn't exist, so just try again with4470// this lexer.4471// (We manually eliminate the tail call to avoid recursion.)4472goto LexNextToken;4473}4474}44754476// Notify MIOpt that we read a non-whitespace/non-comment token.4477MIOpt.ReadToken();44784479// Update the location of token as well as BufferPtr.4480FormTokenWithChars(Result, CurPtr, Kind);4481return true;44824483HandleDirective:4484// We parsed a # character and it's the start of a preprocessing directive.44854486FormTokenWithChars(Result, CurPtr, tok::hash);4487PP->HandleDirective(Result);44884489if (PP->hadModuleLoaderFatalFailure())4490// With a fatal failure in the module loader, we abort parsing.4491return true;44924493// We parsed the directive; lex a token with the new state.4494return false;44954496LexNextToken:4497Result.clearFlag(Token::NeedsCleaning);4498goto LexStart;4499}45004501const char *Lexer::convertDependencyDirectiveToken(4502const dependency_directives_scan::Token &DDTok, Token &Result) {4503const char *TokPtr = BufferStart + DDTok.Offset;4504Result.startToken();4505Result.setLocation(getSourceLocation(TokPtr));4506Result.setKind(DDTok.Kind);4507Result.setFlag((Token::TokenFlags)DDTok.Flags);4508Result.setLength(DDTok.Length);4509BufferPtr = TokPtr + DDTok.Length;4510return TokPtr;4511}45124513bool Lexer::LexDependencyDirectiveToken(Token &Result) {4514assert(isDependencyDirectivesLexer());45154516using namespace 
dependency_directives_scan;45174518while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {4519if (DepDirectives.front().Kind == pp_eof)4520return LexEndOfFile(Result, BufferEnd);4521if (DepDirectives.front().Kind == tokens_present_before_eof)4522MIOpt.ReadToken();4523NextDepDirectiveTokenIndex = 0;4524DepDirectives = DepDirectives.drop_front();4525}45264527const dependency_directives_scan::Token &DDTok =4528DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];4529if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {4530// Read something other than a preprocessor directive hash.4531MIOpt.ReadToken();4532}45334534if (ParsingFilename && DDTok.is(tok::less)) {4535BufferPtr = BufferStart + DDTok.Offset;4536LexAngledStringLiteral(Result, BufferPtr + 1);4537if (Result.isNot(tok::header_name))4538return true;4539// Advance the index of lexed tokens.4540while (true) {4541const dependency_directives_scan::Token &NextTok =4542DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];4543if (BufferStart + NextTok.Offset >= BufferPtr)4544break;4545++NextDepDirectiveTokenIndex;4546}4547return true;4548}45494550const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);45514552if (Result.is(tok::hash) && Result.isAtStartOfLine()) {4553PP->HandleDirective(Result);4554return false;4555}4556if (Result.is(tok::raw_identifier)) {4557Result.setRawIdentifierData(TokPtr);4558if (!isLexingRawMode()) {4559const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);4560if (II->isHandleIdentifierCase())4561return PP->HandleIdentifier(Result);4562}4563return true;4564}4565if (Result.isLiteral()) {4566Result.setLiteralData(TokPtr);4567return true;4568}4569if (Result.is(tok::colon)) {4570// Convert consecutive colons to 'tok::coloncolon'.4571if (*BufferPtr == ':') {4572assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(4573tok::colon));4574++NextDepDirectiveTokenIndex;4575Result.setKind(tok::coloncolon);4576}4577return 
true;4578}4579if (Result.is(tok::eod))4580ParsingPreprocessorDirective = false;45814582return true;4583}45844585bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {4586assert(isDependencyDirectivesLexer());45874588using namespace dependency_directives_scan;45894590bool Stop = false;4591unsigned NestedIfs = 0;4592do {4593DepDirectives = DepDirectives.drop_front();4594switch (DepDirectives.front().Kind) {4595case pp_none:4596llvm_unreachable("unexpected 'pp_none'");4597case pp_include:4598case pp___include_macros:4599case pp_define:4600case pp_undef:4601case pp_import:4602case pp_pragma_import:4603case pp_pragma_once:4604case pp_pragma_push_macro:4605case pp_pragma_pop_macro:4606case pp_pragma_include_alias:4607case pp_pragma_system_header:4608case pp_include_next:4609case decl_at_import:4610case cxx_module_decl:4611case cxx_import_decl:4612case cxx_export_module_decl:4613case cxx_export_import_decl:4614case tokens_present_before_eof:4615break;4616case pp_if:4617case pp_ifdef:4618case pp_ifndef:4619++NestedIfs;4620break;4621case pp_elif:4622case pp_elifdef:4623case pp_elifndef:4624case pp_else:4625if (!NestedIfs) {4626Stop = true;4627}4628break;4629case pp_endif:4630if (!NestedIfs) {4631Stop = true;4632} else {4633--NestedIfs;4634}4635break;4636case pp_eof:4637NextDepDirectiveTokenIndex = 0;4638return LexEndOfFile(Result, BufferEnd);4639}4640} while (!Stop);46414642const dependency_directives_scan::Token &DDTok =4643DepDirectives.front().Tokens.front();4644assert(DDTok.is(tok::hash));4645NextDepDirectiveTokenIndex = 1;46464647convertDependencyDirectiveToken(DDTok, Result);4648return false;4649}465046514652