// Path: blob/main/contrib/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7///8/// \file9/// This file implements FormatTokenLexer, which tokenizes a source file10/// into a FormatToken stream suitable for ClangFormat.11///12//===----------------------------------------------------------------------===//1314#include "FormatTokenLexer.h"15#include "FormatToken.h"16#include "clang/Basic/SourceLocation.h"17#include "clang/Basic/SourceManager.h"18#include "clang/Format/Format.h"19#include "llvm/Support/Regex.h"2021namespace clang {22namespace format {2324FormatTokenLexer::FormatTokenLexer(25const SourceManager &SourceMgr, FileID ID, unsigned Column,26const FormatStyle &Style, encoding::Encoding Encoding,27llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,28IdentifierTable &IdentTable)29: FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),30Column(Column), TrailingWhitespace(0),31LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),32Style(Style), IdentTable(IdentTable), Keywords(IdentTable),33Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),34FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),35MacroBlockEndRegex(Style.MacroBlockEnd) {36Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));37Lex->SetKeepWhitespaceMode(true);3839for (const std::string &ForEachMacro : Style.ForEachMacros) {40auto Identifier = &IdentTable.get(ForEachMacro);41Macros.insert({Identifier, TT_ForEachMacro});42}43for (const std::string &IfMacro : Style.IfMacros) {44auto Identifier = &IdentTable.get(IfMacro);45Macros.insert({Identifier, TT_IfMacro});46}47for (const std::string &AttributeMacro : 
Style.AttributeMacros) {48auto Identifier = &IdentTable.get(AttributeMacro);49Macros.insert({Identifier, TT_AttributeMacro});50}51for (const std::string &StatementMacro : Style.StatementMacros) {52auto Identifier = &IdentTable.get(StatementMacro);53Macros.insert({Identifier, TT_StatementMacro});54}55for (const std::string &TypenameMacro : Style.TypenameMacros) {56auto Identifier = &IdentTable.get(TypenameMacro);57Macros.insert({Identifier, TT_TypenameMacro});58}59for (const std::string &NamespaceMacro : Style.NamespaceMacros) {60auto Identifier = &IdentTable.get(NamespaceMacro);61Macros.insert({Identifier, TT_NamespaceMacro});62}63for (const std::string &WhitespaceSensitiveMacro :64Style.WhitespaceSensitiveMacros) {65auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);66Macros.insert({Identifier, TT_UntouchableMacroFunc});67}68for (const std::string &StatementAttributeLikeMacro :69Style.StatementAttributeLikeMacros) {70auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);71Macros.insert({Identifier, TT_StatementAttributeLikeMacro});72}7374for (const auto &TypeName : Style.TypeNames)75TypeNames.insert(&IdentTable.get(TypeName));76}7778ArrayRef<FormatToken *> FormatTokenLexer::lex() {79assert(Tokens.empty());80assert(FirstInLineIndex == 0);81do {82Tokens.push_back(getNextToken());83if (Style.isJavaScript()) {84tryParseJSRegexLiteral();85handleTemplateStrings();86}87if (Style.Language == FormatStyle::LK_TextProto)88tryParsePythonComment();89tryMergePreviousTokens();90if (Style.isCSharp()) {91// This needs to come after tokens have been merged so that C#92// string literals are correctly identified.93handleCSharpVerbatimAndInterpolatedStrings();94}95if (Style.isTableGen()) {96handleTableGenMultilineString();97handleTableGenNumericLikeIdentifier();98}99if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)100FirstInLineIndex = Tokens.size() - 1;101} while (Tokens.back()->isNot(tok::eof));102if (Style.InsertNewlineAtEOF) {103auto &TokEOF = 
*Tokens.back();104if (TokEOF.NewlinesBefore == 0) {105TokEOF.NewlinesBefore = 1;106TokEOF.OriginalColumn = 0;107}108}109return Tokens;110}111112void FormatTokenLexer::tryMergePreviousTokens() {113if (tryMerge_TMacro())114return;115if (tryMergeConflictMarkers())116return;117if (tryMergeLessLess())118return;119if (tryMergeGreaterGreater())120return;121if (tryMergeForEach())122return;123if (Style.isCpp() && tryTransformTryUsageForC())124return;125126if (Style.isJavaScript() || Style.isCSharp()) {127static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,128tok::question};129static const tok::TokenKind NullPropagatingOperator[] = {tok::question,130tok::period};131static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};132133if (tryMergeTokens(FatArrow, TT_FatArrow))134return;135if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {136// Treat like the "||" operator (as opposed to the ternary ?).137Tokens.back()->Tok.setKind(tok::pipepipe);138return;139}140if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {141// Treat like a regular "." 
access.142Tokens.back()->Tok.setKind(tok::period);143return;144}145if (tryMergeNullishCoalescingEqual())146return;147}148149if (Style.isCSharp()) {150static const tok::TokenKind CSharpNullConditionalLSquare[] = {151tok::question, tok::l_square};152153if (tryMergeCSharpKeywordVariables())154return;155if (tryMergeCSharpStringLiteral())156return;157if (tryTransformCSharpForEach())158return;159if (tryMergeTokens(CSharpNullConditionalLSquare,160TT_CSharpNullConditionalLSquare)) {161// Treat like a regular "[" operator.162Tokens.back()->Tok.setKind(tok::l_square);163return;164}165}166167if (tryMergeNSStringLiteral())168return;169170if (Style.isJavaScript()) {171static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};172static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,173tok::equal};174static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,175tok::greaterequal};176static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};177static const tok::TokenKind JSExponentiationEqual[] = {tok::star,178tok::starequal};179static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};180static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};181182// FIXME: Investigate what token type gives the correct operator priority.183if (tryMergeTokens(JSIdentity, TT_BinaryOperator))184return;185if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))186return;187if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))188return;189if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))190return;191if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {192Tokens.back()->Tok.setKind(tok::starequal);193return;194}195if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||196tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {197// Treat like the "=" assignment operator.198Tokens.back()->Tok.setKind(tok::equal);199return;200}201if (tryMergeJSPrivateIdentifier())202return;203}204205if 
(Style.Language == FormatStyle::LK_Java) {206static const tok::TokenKind JavaRightLogicalShiftAssign[] = {207tok::greater, tok::greater, tok::greaterequal};208if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))209return;210}211212if (Style.isVerilog()) {213// Merge the number following a base like `'h?a0`.214if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&215Tokens.end()[-2]->is(tok::numeric_constant) &&216Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,217tok::question) &&218tryMergeTokens(2, TT_Unknown)) {219return;220}221// Part select.222if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},223TT_BitFieldColon)) {224return;225}226// Xnor. The combined token is treated as a caret which can also be either a227// unary or binary operator. The actual type is determined in228// TokenAnnotator. We also check the token length so we know it is not229// already a merged token.230if (Tokens.back()->TokenText.size() == 1 &&231tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},232TT_BinaryOperator)) {233Tokens.back()->Tok.setKind(tok::caret);234return;235}236// Signed shift and distribution weight.237if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {238Tokens.back()->Tok.setKind(tok::lessless);239return;240}241if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {242Tokens.back()->Tok.setKind(tok::greatergreater);243return;244}245if (tryMergeTokensAny({{tok::lessless, tok::equal},246{tok::lessless, tok::lessequal},247{tok::greatergreater, tok::equal},248{tok::greatergreater, tok::greaterequal},249{tok::colon, tok::equal},250{tok::colon, tok::slash}},251TT_BinaryOperator)) {252Tokens.back()->ForcedPrecedence = prec::Assignment;253return;254}255// Exponentiation, signed shift, case equality, and wildcard equality.256if (tryMergeTokensAny({{tok::star, tok::star},257{tok::lessless, tok::less},258{tok::greatergreater, tok::greater},259{tok::exclaimequal, 
tok::equal},260{tok::exclaimequal, tok::question},261{tok::equalequal, tok::equal},262{tok::equalequal, tok::question}},263TT_BinaryOperator)) {264return;265}266// Module paths in specify blocks and the implication and boolean equality267// operators.268if (tryMergeTokensAny({{tok::plusequal, tok::greater},269{tok::plus, tok::star, tok::greater},270{tok::minusequal, tok::greater},271{tok::minus, tok::star, tok::greater},272{tok::less, tok::arrow},273{tok::equal, tok::greater},274{tok::star, tok::greater},275{tok::pipeequal, tok::greater},276{tok::pipe, tok::arrow},277{tok::hash, tok::minus, tok::hash},278{tok::hash, tok::equal, tok::hash}},279TT_BinaryOperator) ||280Tokens.back()->is(tok::arrow)) {281Tokens.back()->ForcedPrecedence = prec::Comma;282return;283}284}285if (Style.isTableGen()) {286// TableGen's Multi line string starts with [{287if (tryMergeTokens({tok::l_square, tok::l_brace},288TT_TableGenMultiLineString)) {289// Set again with finalizing. This must never be annotated as other types.290Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);291Tokens.back()->Tok.setKind(tok::string_literal);292return;293}294// TableGen's bang operator is the form !<name>.295// !cond is a special case with specific syntax.296if (tryMergeTokens({tok::exclaim, tok::identifier},297TT_TableGenBangOperator)) {298Tokens.back()->Tok.setKind(tok::identifier);299Tokens.back()->Tok.setIdentifierInfo(nullptr);300if (Tokens.back()->TokenText == "!cond")301Tokens.back()->setFinalizedType(TT_TableGenCondOperator);302else303Tokens.back()->setFinalizedType(TT_TableGenBangOperator);304return;305}306if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {307// Here, "! if" becomes "!if". That is, ! captures if even when the space308// exists. 
That is only one possibility in TableGen's syntax.309Tokens.back()->Tok.setKind(tok::identifier);310Tokens.back()->Tok.setIdentifierInfo(nullptr);311Tokens.back()->setFinalizedType(TT_TableGenBangOperator);312return;313}314// +, - with numbers are literals. Not unary operators.315if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {316Tokens.back()->Tok.setKind(tok::numeric_constant);317return;318}319if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {320Tokens.back()->Tok.setKind(tok::numeric_constant);321return;322}323}324}325326bool FormatTokenLexer::tryMergeNSStringLiteral() {327if (Tokens.size() < 2)328return false;329auto &At = *(Tokens.end() - 2);330auto &String = *(Tokens.end() - 1);331if (At->isNot(tok::at) || String->isNot(tok::string_literal))332return false;333At->Tok.setKind(tok::string_literal);334At->TokenText = StringRef(At->TokenText.begin(),335String->TokenText.end() - At->TokenText.begin());336At->ColumnWidth += String->ColumnWidth;337At->setType(TT_ObjCStringLiteral);338Tokens.erase(Tokens.end() - 1);339return true;340}341342bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {343// Merges #idenfier into a single identifier with the text #identifier344// but the token tok::identifier.345if (Tokens.size() < 2)346return false;347auto &Hash = *(Tokens.end() - 2);348auto &Identifier = *(Tokens.end() - 1);349if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))350return false;351Hash->Tok.setKind(tok::identifier);352Hash->TokenText =353StringRef(Hash->TokenText.begin(),354Identifier->TokenText.end() - Hash->TokenText.begin());355Hash->ColumnWidth += Identifier->ColumnWidth;356Hash->setType(TT_JsPrivateIdentifier);357Tokens.erase(Tokens.end() - 1);358return true;359}360361// Search for verbatim or interpolated string literals @"ABC" or362// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to363// prevent splitting of @, $ and ".364// Merging of multiline verbatim strings with 
embedded '"' is handled in365// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.366bool FormatTokenLexer::tryMergeCSharpStringLiteral() {367if (Tokens.size() < 2)368return false;369370// Look for @"aaaaaa" or $"aaaaaa".371const auto String = *(Tokens.end() - 1);372if (String->isNot(tok::string_literal))373return false;374375auto Prefix = *(Tokens.end() - 2);376if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")377return false;378379if (Tokens.size() > 2) {380const auto Tok = *(Tokens.end() - 3);381if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||382(Tok->is(tok::at) && Prefix->TokenText == "$")) {383// This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.384Tok->ColumnWidth += Prefix->ColumnWidth;385Tokens.erase(Tokens.end() - 2);386Prefix = Tok;387}388}389390// Convert back into just a string_literal.391Prefix->Tok.setKind(tok::string_literal);392Prefix->TokenText =393StringRef(Prefix->TokenText.begin(),394String->TokenText.end() - Prefix->TokenText.begin());395Prefix->ColumnWidth += String->ColumnWidth;396Prefix->setType(TT_CSharpStringLiteral);397Tokens.erase(Tokens.end() - 1);398return true;399}400401// Valid C# attribute targets:402// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets403const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {404"assembly", "module", "field", "event", "method",405"param", "property", "return", "type",406};407408bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {409if (Tokens.size() < 2)410return false;411auto &NullishCoalescing = *(Tokens.end() - 2);412auto &Equal = *(Tokens.end() - 1);413if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||414Equal->isNot(tok::equal)) {415return false;416}417NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.418NullishCoalescing->TokenText =419StringRef(NullishCoalescing->TokenText.begin(),420Equal->TokenText.end() - 
NullishCoalescing->TokenText.begin());421NullishCoalescing->ColumnWidth += Equal->ColumnWidth;422NullishCoalescing->setType(TT_NullCoalescingEqual);423Tokens.erase(Tokens.end() - 1);424return true;425}426427bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {428if (Tokens.size() < 2)429return false;430const auto At = *(Tokens.end() - 2);431if (At->isNot(tok::at))432return false;433const auto Keyword = *(Tokens.end() - 1);434if (Keyword->TokenText == "$")435return false;436if (!Keywords.isCSharpKeyword(*Keyword))437return false;438439At->Tok.setKind(tok::identifier);440At->TokenText = StringRef(At->TokenText.begin(),441Keyword->TokenText.end() - At->TokenText.begin());442At->ColumnWidth += Keyword->ColumnWidth;443At->setType(Keyword->getType());444Tokens.erase(Tokens.end() - 1);445return true;446}447448// In C# transform identifier foreach into kw_foreach449bool FormatTokenLexer::tryTransformCSharpForEach() {450if (Tokens.size() < 1)451return false;452auto &Identifier = *(Tokens.end() - 1);453if (Identifier->isNot(tok::identifier))454return false;455if (Identifier->TokenText != "foreach")456return false;457458Identifier->setType(TT_ForEachMacro);459Identifier->Tok.setKind(tok::kw_for);460return true;461}462463bool FormatTokenLexer::tryMergeForEach() {464if (Tokens.size() < 2)465return false;466auto &For = *(Tokens.end() - 2);467auto &Each = *(Tokens.end() - 1);468if (For->isNot(tok::kw_for))469return false;470if (Each->isNot(tok::identifier))471return false;472if (Each->TokenText != "each")473return false;474475For->setType(TT_ForEachMacro);476For->Tok.setKind(tok::kw_for);477478For->TokenText = StringRef(For->TokenText.begin(),479Each->TokenText.end() - For->TokenText.begin());480For->ColumnWidth += Each->ColumnWidth;481Tokens.erase(Tokens.end() - 1);482return true;483}484485bool FormatTokenLexer::tryTransformTryUsageForC() {486if (Tokens.size() < 2)487return false;488auto &Try = *(Tokens.end() - 2);489if (Try->isNot(tok::kw_try))490return false;491auto &Next 
= *(Tokens.end() - 1);492if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))493return false;494495if (Tokens.size() > 2) {496auto &At = *(Tokens.end() - 3);497if (At->is(tok::at))498return false;499}500501Try->Tok.setKind(tok::identifier);502return true;503}504505bool FormatTokenLexer::tryMergeLessLess() {506// Merge X,less,less,Y into X,lessless,Y unless X or Y is less.507if (Tokens.size() < 3)508return false;509510auto First = Tokens.end() - 3;511if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))512return false;513514// Only merge if there currently is no whitespace between the two "<".515if (First[1]->hasWhitespaceBefore())516return false;517518auto X = Tokens.size() > 3 ? First[-1] : nullptr;519if (X && X->is(tok::less))520return false;521522auto Y = First[2];523if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))524return false;525526First[0]->Tok.setKind(tok::lessless);527First[0]->TokenText = "<<";528First[0]->ColumnWidth += 1;529Tokens.erase(Tokens.end() - 2);530return true;531}532533bool FormatTokenLexer::tryMergeGreaterGreater() {534// Merge kw_operator,greater,greater into kw_operator,greatergreater.535if (Tokens.size() < 2)536return false;537538auto First = Tokens.end() - 2;539if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))540return false;541542// Only merge if there currently is no whitespace between the first two ">".543if (First[1]->hasWhitespaceBefore())544return false;545546auto Tok = Tokens.size() > 2 ? 
First[-1] : nullptr;547if (Tok && Tok->isNot(tok::kw_operator))548return false;549550First[0]->Tok.setKind(tok::greatergreater);551First[0]->TokenText = ">>";552First[0]->ColumnWidth += 1;553Tokens.erase(Tokens.end() - 1);554return true;555}556557bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,558TokenType NewType) {559if (Tokens.size() < Kinds.size())560return false;561562SmallVectorImpl<FormatToken *>::const_iterator First =563Tokens.end() - Kinds.size();564for (unsigned i = 0; i < Kinds.size(); ++i)565if (First[i]->isNot(Kinds[i]))566return false;567568return tryMergeTokens(Kinds.size(), NewType);569}570571bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {572if (Tokens.size() < Count)573return false;574575SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;576unsigned AddLength = 0;577for (size_t i = 1; i < Count; ++i) {578// If there is whitespace separating the token and the previous one,579// they should not be merged.580if (First[i]->hasWhitespaceBefore())581return false;582AddLength += First[i]->TokenText.size();583}584585Tokens.resize(Tokens.size() - Count + 1);586First[0]->TokenText = StringRef(First[0]->TokenText.data(),587First[0]->TokenText.size() + AddLength);588First[0]->ColumnWidth += AddLength;589First[0]->setType(NewType);590return true;591}592593bool FormatTokenLexer::tryMergeTokensAny(594ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {595return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {596return tryMergeTokens(Kinds, NewType);597});598}599600// Returns \c true if \p Tok can only be followed by an operand in JavaScript.601bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {602// NB: This is not entirely correct, as an r_paren can introduce an operand603// location in e.g. `if (foo) /bar/.exec(...);`. 
That is a rare enough604// corner case to not matter in practice, though.605return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,606tok::r_brace, tok::l_square, tok::semi, tok::exclaim,607tok::colon, tok::question, tok::tilde) ||608Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,609tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,610tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||611Tok->isBinaryOperator();612}613614bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {615if (!Prev)616return true;617618// Regex literals can only follow after prefix unary operators, not after619// postfix unary operators. If the '++' is followed by a non-operand620// introducing token, the slash here is the operand and not the start of a621// regex.622// `!` is an unary prefix operator, but also a post-fix operator that casts623// away nullability, so the same check applies.624if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))625return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);626627// The previous token must introduce an operand location where regex628// literals can occur.629if (!precedesOperand(Prev))630return false;631632return true;633}634635// Tries to parse a JavaScript Regex literal starting at the current token,636// if that begins with a slash and is in a location where JavaScript allows637// regex literals. 
Changes the current token to a regex literal and updates638// its text if successful.639void FormatTokenLexer::tryParseJSRegexLiteral() {640FormatToken *RegexToken = Tokens.back();641if (!RegexToken->isOneOf(tok::slash, tok::slashequal))642return;643644FormatToken *Prev = nullptr;645for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {646// NB: Because previous pointers are not initialized yet, this cannot use647// Token.getPreviousNonComment.648if (FT->isNot(tok::comment)) {649Prev = FT;650break;651}652}653654if (!canPrecedeRegexLiteral(Prev))655return;656657// 'Manually' lex ahead in the current file buffer.658const char *Offset = Lex->getBufferLocation();659const char *RegexBegin = Offset - RegexToken->TokenText.size();660StringRef Buffer = Lex->getBuffer();661bool InCharacterClass = false;662bool HaveClosingSlash = false;663for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {664// Regular expressions are terminated with a '/', which can only be665// escaped using '\' or a character class between '[' and ']'.666// See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.667switch (*Offset) {668case '\\':669// Skip the escaped character.670++Offset;671break;672case '[':673InCharacterClass = true;674break;675case ']':676InCharacterClass = false;677break;678case '/':679if (!InCharacterClass)680HaveClosingSlash = true;681break;682}683}684685RegexToken->setType(TT_RegexLiteral);686// Treat regex literals like other string_literals.687RegexToken->Tok.setKind(tok::string_literal);688RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);689RegexToken->ColumnWidth = RegexToken->TokenText.size();690691resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));692}693694static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,695bool Interpolated) {696auto Repeated = [&Begin, End]() {697return Begin + 1 < End && Begin[1] == Begin[0];698};699700// Look for a terminating '"' in the current file 
buffer.701// Make no effort to format code within an interpolated or verbatim string.702//703// Interpolated strings could contain { } with " characters inside.704// $"{x ?? "null"}"705// should not be split into $"{x ?? ", null, "}" but should be treated as a706// single string-literal.707//708// We opt not to try and format expressions inside {} within a C#709// interpolated string. Formatting expressions within an interpolated string710// would require similar work as that done for JavaScript template strings711// in `handleTemplateStrings()`.712for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {713switch (*Begin) {714case '\\':715if (!Verbatim)716++Begin;717break;718case '{':719if (Interpolated) {720// {{ inside an interpolated string is escaped, so skip it.721if (Repeated())722++Begin;723else724++UnmatchedOpeningBraceCount;725}726break;727case '}':728if (Interpolated) {729// }} inside an interpolated string is escaped, so skip it.730if (Repeated())731++Begin;732else if (UnmatchedOpeningBraceCount > 0)733--UnmatchedOpeningBraceCount;734else735return End;736}737break;738case '"':739if (UnmatchedOpeningBraceCount > 0)740break;741// "" within a verbatim string is an escaped double quote: skip it.742if (Verbatim && Repeated()) {743++Begin;744break;745}746return Begin;747}748}749750return End;751}752753void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {754FormatToken *CSharpStringLiteral = Tokens.back();755756if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))757return;758759auto &TokenText = CSharpStringLiteral->TokenText;760761bool Verbatim = false;762bool Interpolated = false;763if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {764Verbatim = true;765Interpolated = true;766} else if (TokenText.starts_with(R"(@")")) {767Verbatim = true;768} else if (TokenText.starts_with(R"($")")) {769Interpolated = true;770}771772// Deal with multiline strings.773if (!Verbatim && !Interpolated)774return;775776const char 
*StrBegin = Lex->getBufferLocation() - TokenText.size();777const char *Offset = StrBegin;778if (Verbatim && Interpolated)779Offset += 3;780else781Offset += 2;782783const auto End = Lex->getBuffer().end();784Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);785786// Make no attempt to format code properly if a verbatim string is787// unterminated.788if (Offset >= End)789return;790791StringRef LiteralText(StrBegin, Offset - StrBegin + 1);792TokenText = LiteralText;793794// Adjust width for potentially multiline string literals.795size_t FirstBreak = LiteralText.find('\n');796StringRef FirstLineText = FirstBreak == StringRef::npos797? LiteralText798: LiteralText.substr(0, FirstBreak);799CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(800FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,801Encoding);802size_t LastBreak = LiteralText.rfind('\n');803if (LastBreak != StringRef::npos) {804CSharpStringLiteral->IsMultiline = true;805unsigned StartColumn = 0;806CSharpStringLiteral->LastLineColumnWidth =807encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),808StartColumn, Style.TabWidth, Encoding);809}810811assert(Offset < End);812resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));813}814815void FormatTokenLexer::handleTableGenMultilineString() {816FormatToken *MultiLineString = Tokens.back();817if (MultiLineString->isNot(TT_TableGenMultiLineString))818return;819820auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;821// "}]" is the end of multi line string.822auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);823if (CloseOffset == StringRef::npos)824return;825auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);826MultiLineString->TokenText = Text;827resetLexer(SourceMgr.getFileOffset(828Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));829auto FirstLineText = Text;830auto FirstBreak = Text.find('\n');831// Set ColumnWidth and 
LastLineColumnWidth when it has multiple lines.832if (FirstBreak != StringRef::npos) {833MultiLineString->IsMultiline = true;834FirstLineText = Text.substr(0, FirstBreak + 1);835// LastLineColumnWidth holds the width of the last line.836auto LastBreak = Text.rfind('\n');837MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(838Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,839Style.TabWidth, Encoding);840}841// ColumnWidth holds only the width of the first line.842MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(843FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);844}845846void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {847FormatToken *Tok = Tokens.back();848// TableGen identifiers can begin with digits. Such tokens are lexed as849// numeric_constant now.850if (Tok->isNot(tok::numeric_constant))851return;852StringRef Text = Tok->TokenText;853// The following check is based on llvm::TGLexer::LexToken.854// That lexes the token as a number if any of the following holds:855// 1. It starts with '+', '-'.856// 2. All the characters are digits.857// 3. The first non-digit character is 'b', and the next is '0' or '1'.858// 4. 
// The first non-digit character is 'x', and the next is a hex digit.
// Note that in the case 3 and 4, if the next character does not exists in
// this token, the token is an identifier.
if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
  return;
const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
// All the characters are digits
if (NonDigitPos == StringRef::npos)
  return;
char FirstNonDigit = Text[NonDigitPos];
if (NonDigitPos < Text.size() - 1) {
  char TheNext = Text[NonDigitPos + 1];
  // Regarded as a binary number.
  if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
    return;
  // Regarded as hex number.
  if (FirstNonDigit == 'x' && isxdigit(TheNext))
    return;
}
if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
  // This is actually an identifier in TableGen.
  Tok->Tok.setKind(tok::identifier);
  Tok->Tok.setIdentifierInfo(nullptr);
}
}

/// Handles JavaScript template strings (backtick-delimited literals).
///
/// Maintains StateStack: a lone "`" token pushes TEMPLATE_STRING; an "${"
/// inside the literal pushes NORMAL (expression interpolation); a closing
/// r_brace pops back into the template string. The function then manually
/// scans the raw buffer from the backtick to either the closing backtick or
/// the next "${", turns the whole span into a single tok::string_literal of
/// type TT_TemplateString, and resets the lexer past the consumed text.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // Entering a nested block/interpolation: ordinary lexing applies inside.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string (or its tail) ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Collapse the scanned range into one string-literal token.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Restart the lexer immediately after the consumed template text.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}

/// Turns a '#' (or '##') token and the rest of its line into a single
/// TT_LineComment token, then resets the lexer to the start of the next line
/// (or end of file). Used for languages with '#'-style line comments.
void FormatTokenLexer::tryParsePythonComment() {
  FormatToken *HashToken = Tokens.back();
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
    return;
  // Turn the remainder of this line into a comment.
  const char *CommentBegin =
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
  size_t From = CommentBegin - Lex->getBuffer().begin();
  size_t To = Lex->getBuffer().find_first_of('\n', From);
  if (To == StringRef::npos)
    To = Lex->getBuffer().size(); // No trailing newline: comment runs to EOF.
  size_t Len = To - From;
  HashToken->setType(TT_LineComment);
  HashToken->Tok.setKind(tok::comment);
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
  SourceLocation Loc = To < Lex->getBuffer().size()
                           ? Lex->getSourceLocation(CommentBegin + Len)
                           : SourceMgr.getLocForEndOfFile(ID);
  resetLexer(SourceMgr.getFileOffset(Loc));
}

/// Merges the four-token sequence `_T ( "..." )` into a single string-literal
/// token spanning the whole macro invocation (Windows TCHAR-style wrapping).
/// Returns true if a merge happened. The merged token inherits the macro
/// token's position/whitespace attributes so line-breaking stays correct.
bool FormatTokenLexer::tryMerge_TMacro() {
  if (Tokens.size() < 4)
    return false;
  FormatToken *Last = Tokens.back();
  if (Last->isNot(tok::r_paren))
    return false;

  FormatToken *String = Tokens[Tokens.size() - 2];
  if (String->isNot(tok::string_literal) || String->IsMultiline)
    return false;

  if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
    return false;

  FormatToken *Macro = Tokens[Tokens.size() - 4];
  if (Macro->TokenText != "_T")
    return false;

  // Extend the string token's text to cover `_T(` ... `)`.
  const char *Start = Macro->TokenText.data();
  const char *End = Last->TokenText.data() + Last->TokenText.size();
  String->TokenText = StringRef(Start, End - Start);
  String->IsFirst = Macro->IsFirst;
  String->LastNewlineOffset = Macro->LastNewlineOffset;
  String->WhitespaceRange = Macro->WhitespaceRange;
  String->OriginalColumn = Macro->OriginalColumn;
  String->ColumnWidth = encoding::columnWidthWithTabs(
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
  String->NewlinesBefore = Macro->NewlinesBefore;
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;

  // Drop `_T`, `(`, `"..."` and replace the `)` slot with the merged token.
  Tokens.pop_back();
  Tokens.pop_back();
  Tokens.pop_back();
  Tokens.back() = String;
  if (FirstInLineIndex >= Tokens.size())
    FirstInLineIndex = Tokens.size() - 1;
  return true;
}

/// Detects VCS conflict-marker lines and collapses all tokens of such a line
/// into one token of type TT_ConflictStart/Alternative/End so the unwrapped
/// line parser can skip them untouched. Returns true if a merge happened.
bool FormatTokenLexer::tryMergeConflictMarkers() {
  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
    return false;

  // Conflict lines look like:
  // <marker> <text from the vcs>
  // For example:
  // >>>>>>> /file/in/file/system at revision 1234
  //
  // We merge all tokens in a line that starts with a conflict marker
  // into a single token with a special token type that the unwrapped line
  // parser will use to correctly rebuild the underlying code.

  // NOTE(review): this local ID intentionally shadows the member `ID`; it is
  // derived from the token's actual location via getDecomposedLoc.
  FileID ID;
  // Get the position of the first token in the line.
  unsigned FirstInLineOffset;
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
  // Calculate the offset of the start of the current line.
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
  if (LineOffset == StringRef::npos)
    LineOffset = 0;
  else
    ++LineOffset;

  // The marker is the run of non-space characters at the start of the line.
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
  StringRef LineStart;
  if (FirstSpace == StringRef::npos)
    LineStart = Buffer.substr(LineOffset);
  else
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);

  // The 4-character variants (">>>>", "====", "<<<<") are Perforce-style
  // markers; the 7-character ones are git/svn-style.
  TokenType Type = TT_Unknown;
  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
    Type = TT_ConflictStart;
  } else if (LineStart == "|||||||" || LineStart == "=======" ||
             LineStart == "====") {
    Type = TT_ConflictAlternative;
  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
    Type = TT_ConflictEnd;
  }

  if (Type != TT_Unknown) {
    // Keep the token that triggered the merge (first token of the NEXT line,
    // or eof) and re-append it after collapsing the marker line.
    FormatToken *Next = Tokens.back();

    Tokens.resize(FirstInLineIndex + 1);
    // We do not need to build a complete token here, as we will skip it
    // during parsing anyway (as we must not touch whitespace around conflict
    // markers).
    Tokens.back()->setType(Type);
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);

    Tokens.push_back(Next);
    return true;
  }

  return false;
}

/// Returns the synthesized second half of a split '>>' or '<<' token.
/// Called when StateStack's top is TOKEN_STASHED: getNextToken previously
/// truncated the token to one character and pushed this state.
FormatToken *FormatTokenLexer::getStashedToken() {
  // Create a synthesized second '>' or '<' token.
  Token Tok = FormatTok->Tok;
  StringRef TokenText = FormatTok->TokenText;

  unsigned OriginalColumn = FormatTok->OriginalColumn;
  FormatTok = new (Allocator.Allocate()) FormatToken;
  FormatTok->Tok = Tok;
  // Place the synthesized token one character past the original's start.
  SourceLocation TokLocation =
      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
  FormatTok->Tok.setLocation(TokLocation);
  // Zero-width whitespace range: nothing separates the two halves.
  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
  FormatTok->TokenText = TokenText;
  FormatTok->ColumnWidth = 1;
  FormatTok->OriginalColumn = OriginalColumn + 1;

  return FormatTok;
}

/// Truncate the current token to the new length and make the lexer continue
/// from the end of the truncated token. Used for other languages that have
/// different token boundaries, like JavaScript in which a comment ends at a
/// line break regardless of whether the line break follows a backslash. Also
/// used to set the lexer to the end of whitespace if the lexer regards
/// whitespace and an unrecognized symbol as one token.
void FormatTokenLexer::truncateToken(size_t NewLen) {
  assert(NewLen <= FormatTok->TokenText.size());
  // Rewind the lexer to just past the truncated portion of the token.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
      Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
      Encoding);
  FormatTok->Tok.setLength(NewLen);
}

/// Count the length of leading whitespace in a token.
static size_t countLeadingWhitespace(StringRef Text) {
  // Basically counting the length matched by this regex.
  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
  // Directly using the regex turned out to be slow. With the regex
  // version formatting all files in this directory took about 1.25
  // seconds. This version took about 0.5 seconds.
  const unsigned char *const Begin = Text.bytes_begin();
  const unsigned char *const End = Text.bytes_end();
  const unsigned char *Cur = Begin;
  while (Cur < End) {
    if (isspace(Cur[0])) {
      ++Cur;
    } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
      // A '\' followed by a newline always escapes the newline, regardless
      // of whether there is another '\' before it.
      // The source has a null byte at the end. So the end of the entire input
      // isn't reached yet. Also the lexer doesn't break apart an escaped
      // newline.
      assert(End - Cur >= 2);
      Cur += 2;
    } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
               (Cur[3] == '\n' || Cur[3] == '\r')) {
      // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
      // characters are quoted individually in this comment because if we write
      // them together some compilers warn that we have a trigraph in the code.
      assert(End - Cur >= 4);
      Cur += 4;
    } else {
      break;
    }
  }
  return Cur - Begin;
}

/// Produces the next FormatToken from the raw lexer, folding leading
/// whitespace into the token's metadata (NewlinesBefore, LastNewlineOffset,
/// OriginalColumn, WhitespaceRange) and applying language-specific token
/// adjustments (JavaScript/Java comment truncation, Verilog renaming of
/// hash/backtick, '>>'/'<<' splitting, C++ macro classification).
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    // The previous call split a '>>' or '<<'; emit the second half now.
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character, tracking column position
    // and escaped-newline state.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
               Text.substr(i, 4) == "\?\?/\r" ||
               Text.substr(i, 4) == "\?\?/\n" ||
               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
                           Text.substr(i - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        truncateToken(BackslashPos + 1);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (Tokens.size() > 0 &&
                 Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the raw identifier against the identifier table; this also
    // recognizes keywords. Some keywords are then demoted back to plain
    // identifiers for languages where they are not reserved.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->is(tok::greatergreater)) {
    // Split '>>' into two '>' tokens; the second half is emitted on the next
    // call via the TOKEN_STASHED state (see getStashedToken).
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->is(tok::lessless)) {
    // Split '<<' into two '<' tokens, same mechanism as '>>'.
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  if (Style.isVerilog() && Tokens.size() > 0 &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    // Do not classify macros on a `#define` line: the name being defined
    // must not be treated as a ForEach/If macro invocation.
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
    }
  }

  return FormatTok;
}

/// Lexes the Verilog-specific tokens (quote, backtick(s), escaped
/// identifiers) that the C++ raw lexer would split incorrectly. Returns true
/// if such a token was produced into \p Tok; false means the caller should
/// fall back to the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  //
  // In Verilog an escaped identifier starts with backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. We check
  // for that outside of the Regex since we can't use negative lookhead
  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  // identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  SmallVector<StringRef, 4> Matches;
  const char *Start = Lex->getBufferLocation();
  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
                          &Matches)) {
    return false;
  }
  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
    return false;
  size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}

/// Reads one raw token into \p Tok, normalizes a few kinds (unterminated
/// strings, JavaScript ''/char constants), and tracks the
/// clang-format on/off state that marks tokens as Finalized.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
    Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.starts_with("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // A "clang-format on" comment re-enables formatting for FOLLOWING tokens;
  // the comment itself is still finalized below if formatting was off.
  if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  // A "clang-format off" comment disables formatting starting with the NEXT
  // token (this comment itself is not finalized).
  if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
    FormattingDisabled = true;
}

/// Re-creates the underlying raw Lexer positioned at \p Offset within the
/// file, keeping whitespace tokens. Used after manual buffer scans and token
/// truncation to resynchronize lexing.
void FormatTokenLexer::resetLexer(unsigned Offset) {
  StringRef Buffer = SourceMgr.getBufferData(ID);
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                      Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
  Lex->SetKeepWhitespaceMode(true);
  TrailingWhitespace = 0;
}

} // namespace format
} // namespace clang