Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
35233 views
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
/// Creates a whitespace-preserving raw lexer for the file \p ID and records
/// every macro name and type name configured in \p Style, interned through
/// \p IdentTable, so that they can be recognized by identifier later on.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0),
      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  Lex->SetKeepWhitespaceMode(true);

  // Interns each name of one configuration list and associates it with the
  // given token type.
  const auto RegisterMacros = [&](const auto &Names, TokenType MacroType) {
    for (const std::string &Name : Names)
      Macros.insert({&IdentTable.get(Name), MacroType});
  };

  // NOTE(review): the lists are registered in this fixed order; presumably a
  // name that appears in several lists keeps the type of its first insertion
  // -- confirm against the container type of Macros before reordering.
  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.IfMacros, TT_IfMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
  RegisterMacros(Style.StatementAttributeLikeMacros,
                 TT_StatementAttributeLikeMacro);

  for (const auto &Name : Style.TypeNames)
    TypeNames.insert(&IdentTable.get(Name));
}
78
79
/// Lexes the whole input into \c Tokens and returns them. Tokens are pulled
/// one at a time; after each one the language-specific fix-ups and merges
/// run. The loop stops once the newest token is the eof token.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);

  for (;;) {
    Tokens.push_back(getNextToken());

    if (Style.isJavaScript()) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();

    tryMergePreviousTokens();

    // C# string literals can only be identified correctly once the merging
    // above has happened, so this step has to follow it.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    if (Style.isTableGen()) {
      handleTableGenMultilineString();
      handleTableGenNumericLikeIdentifier();
    }

    // The steps above may have replaced or fused tokens, so the newest token
    // is only looked up now.
    FormatToken *Newest = Tokens.back();
    if (Newest->NewlinesBefore > 0 || Newest->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;
    if (Newest->is(tok::eof))
      break;
  }

  if (Style.InsertNewlineAtEOF) {
    // Make the eof token start a fresh line if it does not already.
    FormatToken &Eof = *Tokens.back();
    if (Eof.NewlinesBefore == 0) {
      Eof.NewlinesBefore = 1;
      Eof.OriginalColumn = 0;
    }
  }
  return Tokens;
}
112
113
void FormatTokenLexer::tryMergePreviousTokens() {
114
if (tryMerge_TMacro())
115
return;
116
if (tryMergeConflictMarkers())
117
return;
118
if (tryMergeLessLess())
119
return;
120
if (tryMergeGreaterGreater())
121
return;
122
if (tryMergeForEach())
123
return;
124
if (Style.isCpp() && tryTransformTryUsageForC())
125
return;
126
127
if (Style.isJavaScript() || Style.isCSharp()) {
128
static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
129
tok::question};
130
static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
131
tok::period};
132
static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
133
134
if (tryMergeTokens(FatArrow, TT_FatArrow))
135
return;
136
if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
137
// Treat like the "||" operator (as opposed to the ternary ?).
138
Tokens.back()->Tok.setKind(tok::pipepipe);
139
return;
140
}
141
if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
142
// Treat like a regular "." access.
143
Tokens.back()->Tok.setKind(tok::period);
144
return;
145
}
146
if (tryMergeNullishCoalescingEqual())
147
return;
148
}
149
150
if (Style.isCSharp()) {
151
static const tok::TokenKind CSharpNullConditionalLSquare[] = {
152
tok::question, tok::l_square};
153
154
if (tryMergeCSharpKeywordVariables())
155
return;
156
if (tryMergeCSharpStringLiteral())
157
return;
158
if (tryTransformCSharpForEach())
159
return;
160
if (tryMergeTokens(CSharpNullConditionalLSquare,
161
TT_CSharpNullConditionalLSquare)) {
162
// Treat like a regular "[" operator.
163
Tokens.back()->Tok.setKind(tok::l_square);
164
return;
165
}
166
}
167
168
if (tryMergeNSStringLiteral())
169
return;
170
171
if (Style.isJavaScript()) {
172
static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
173
static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
174
tok::equal};
175
static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
176
tok::greaterequal};
177
static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
178
static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
179
tok::starequal};
180
static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
181
static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
182
183
// FIXME: Investigate what token type gives the correct operator priority.
184
if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
185
return;
186
if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
187
return;
188
if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
189
return;
190
if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
191
return;
192
if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
193
Tokens.back()->Tok.setKind(tok::starequal);
194
return;
195
}
196
if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
197
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
198
// Treat like the "=" assignment operator.
199
Tokens.back()->Tok.setKind(tok::equal);
200
return;
201
}
202
if (tryMergeJSPrivateIdentifier())
203
return;
204
}
205
206
if (Style.Language == FormatStyle::LK_Java) {
207
static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
208
tok::greater, tok::greater, tok::greaterequal};
209
if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
210
return;
211
}
212
213
if (Style.isVerilog()) {
214
// Merge the number following a base like `'h?a0`.
215
if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
216
Tokens.end()[-2]->is(tok::numeric_constant) &&
217
Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
218
tok::question) &&
219
tryMergeTokens(2, TT_Unknown)) {
220
return;
221
}
222
// Part select.
223
if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
224
TT_BitFieldColon)) {
225
return;
226
}
227
// Xnor. The combined token is treated as a caret which can also be either a
228
// unary or binary operator. The actual type is determined in
229
// TokenAnnotator. We also check the token length so we know it is not
230
// already a merged token.
231
if (Tokens.back()->TokenText.size() == 1 &&
232
tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
233
TT_BinaryOperator)) {
234
Tokens.back()->Tok.setKind(tok::caret);
235
return;
236
}
237
// Signed shift and distribution weight.
238
if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
239
Tokens.back()->Tok.setKind(tok::lessless);
240
return;
241
}
242
if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
243
Tokens.back()->Tok.setKind(tok::greatergreater);
244
return;
245
}
246
if (tryMergeTokensAny({{tok::lessless, tok::equal},
247
{tok::lessless, tok::lessequal},
248
{tok::greatergreater, tok::equal},
249
{tok::greatergreater, tok::greaterequal},
250
{tok::colon, tok::equal},
251
{tok::colon, tok::slash}},
252
TT_BinaryOperator)) {
253
Tokens.back()->ForcedPrecedence = prec::Assignment;
254
return;
255
}
256
// Exponentiation, signed shift, case equality, and wildcard equality.
257
if (tryMergeTokensAny({{tok::star, tok::star},
258
{tok::lessless, tok::less},
259
{tok::greatergreater, tok::greater},
260
{tok::exclaimequal, tok::equal},
261
{tok::exclaimequal, tok::question},
262
{tok::equalequal, tok::equal},
263
{tok::equalequal, tok::question}},
264
TT_BinaryOperator)) {
265
return;
266
}
267
// Module paths in specify blocks and the implication and boolean equality
268
// operators.
269
if (tryMergeTokensAny({{tok::plusequal, tok::greater},
270
{tok::plus, tok::star, tok::greater},
271
{tok::minusequal, tok::greater},
272
{tok::minus, tok::star, tok::greater},
273
{tok::less, tok::arrow},
274
{tok::equal, tok::greater},
275
{tok::star, tok::greater},
276
{tok::pipeequal, tok::greater},
277
{tok::pipe, tok::arrow},
278
{tok::hash, tok::minus, tok::hash},
279
{tok::hash, tok::equal, tok::hash}},
280
TT_BinaryOperator) ||
281
Tokens.back()->is(tok::arrow)) {
282
Tokens.back()->ForcedPrecedence = prec::Comma;
283
return;
284
}
285
}
286
if (Style.isTableGen()) {
287
// TableGen's Multi line string starts with [{
288
if (tryMergeTokens({tok::l_square, tok::l_brace},
289
TT_TableGenMultiLineString)) {
290
// Set again with finalizing. This must never be annotated as other types.
291
Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
292
Tokens.back()->Tok.setKind(tok::string_literal);
293
return;
294
}
295
// TableGen's bang operator is the form !<name>.
296
// !cond is a special case with specific syntax.
297
if (tryMergeTokens({tok::exclaim, tok::identifier},
298
TT_TableGenBangOperator)) {
299
Tokens.back()->Tok.setKind(tok::identifier);
300
Tokens.back()->Tok.setIdentifierInfo(nullptr);
301
if (Tokens.back()->TokenText == "!cond")
302
Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
303
else
304
Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
305
return;
306
}
307
if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
308
// Here, "! if" becomes "!if". That is, ! captures if even when the space
309
// exists. That is only one possibility in TableGen's syntax.
310
Tokens.back()->Tok.setKind(tok::identifier);
311
Tokens.back()->Tok.setIdentifierInfo(nullptr);
312
Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
313
return;
314
}
315
// +, - with numbers are literals. Not unary operators.
316
if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
317
Tokens.back()->Tok.setKind(tok::numeric_constant);
318
return;
319
}
320
if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
321
Tokens.back()->Tok.setKind(tok::numeric_constant);
322
return;
323
}
324
}
325
}
326
327
bool FormatTokenLexer::tryMergeNSStringLiteral() {
328
if (Tokens.size() < 2)
329
return false;
330
auto &At = *(Tokens.end() - 2);
331
auto &String = *(Tokens.end() - 1);
332
if (At->isNot(tok::at) || String->isNot(tok::string_literal))
333
return false;
334
At->Tok.setKind(tok::string_literal);
335
At->TokenText = StringRef(At->TokenText.begin(),
336
String->TokenText.end() - At->TokenText.begin());
337
At->ColumnWidth += String->ColumnWidth;
338
At->setType(TT_ObjCStringLiteral);
339
Tokens.erase(Tokens.end() - 1);
340
return true;
341
}
342
343
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
344
// Merges #idenfier into a single identifier with the text #identifier
345
// but the token tok::identifier.
346
if (Tokens.size() < 2)
347
return false;
348
auto &Hash = *(Tokens.end() - 2);
349
auto &Identifier = *(Tokens.end() - 1);
350
if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
351
return false;
352
Hash->Tok.setKind(tok::identifier);
353
Hash->TokenText =
354
StringRef(Hash->TokenText.begin(),
355
Identifier->TokenText.end() - Hash->TokenText.begin());
356
Hash->ColumnWidth += Identifier->ColumnWidth;
357
Hash->setType(TT_JsPrivateIdentifier);
358
Tokens.erase(Tokens.end() - 1);
359
return true;
360
}
361
362
// Search for verbatim or interpolated string literals @"ABC" or
363
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
364
// prevent splitting of @, $ and ".
365
// Merging of multiline verbatim strings with embedded '"' is handled in
366
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
367
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
368
if (Tokens.size() < 2)
369
return false;
370
371
// Look for @"aaaaaa" or $"aaaaaa".
372
const auto String = *(Tokens.end() - 1);
373
if (String->isNot(tok::string_literal))
374
return false;
375
376
auto Prefix = *(Tokens.end() - 2);
377
if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
378
return false;
379
380
if (Tokens.size() > 2) {
381
const auto Tok = *(Tokens.end() - 3);
382
if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
383
(Tok->is(tok::at) && Prefix->TokenText == "$")) {
384
// This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
385
Tok->ColumnWidth += Prefix->ColumnWidth;
386
Tokens.erase(Tokens.end() - 2);
387
Prefix = Tok;
388
}
389
}
390
391
// Convert back into just a string_literal.
392
Prefix->Tok.setKind(tok::string_literal);
393
Prefix->TokenText =
394
StringRef(Prefix->TokenText.begin(),
395
String->TokenText.end() - Prefix->TokenText.begin());
396
Prefix->ColumnWidth += String->ColumnWidth;
397
Prefix->setType(TT_CSharpStringLiteral);
398
Tokens.erase(Tokens.end() - 1);
399
return true;
400
}
401
402
// Valid C# attribute targets:
403
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
404
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
405
"assembly", "module", "field", "event", "method",
406
"param", "property", "return", "type",
407
};
408
409
bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
410
if (Tokens.size() < 2)
411
return false;
412
auto &NullishCoalescing = *(Tokens.end() - 2);
413
auto &Equal = *(Tokens.end() - 1);
414
if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
415
Equal->isNot(tok::equal)) {
416
return false;
417
}
418
NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
419
NullishCoalescing->TokenText =
420
StringRef(NullishCoalescing->TokenText.begin(),
421
Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
422
NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
423
NullishCoalescing->setType(TT_NullCoalescingEqual);
424
Tokens.erase(Tokens.end() - 1);
425
return true;
426
}
427
428
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
429
if (Tokens.size() < 2)
430
return false;
431
const auto At = *(Tokens.end() - 2);
432
if (At->isNot(tok::at))
433
return false;
434
const auto Keyword = *(Tokens.end() - 1);
435
if (Keyword->TokenText == "$")
436
return false;
437
if (!Keywords.isCSharpKeyword(*Keyword))
438
return false;
439
440
At->Tok.setKind(tok::identifier);
441
At->TokenText = StringRef(At->TokenText.begin(),
442
Keyword->TokenText.end() - At->TokenText.begin());
443
At->ColumnWidth += Keyword->ColumnWidth;
444
At->setType(Keyword->getType());
445
Tokens.erase(Tokens.end() - 1);
446
return true;
447
}
448
449
// In C# transform identifier foreach into kw_foreach
450
bool FormatTokenLexer::tryTransformCSharpForEach() {
451
if (Tokens.size() < 1)
452
return false;
453
auto &Identifier = *(Tokens.end() - 1);
454
if (Identifier->isNot(tok::identifier))
455
return false;
456
if (Identifier->TokenText != "foreach")
457
return false;
458
459
Identifier->setType(TT_ForEachMacro);
460
Identifier->Tok.setKind(tok::kw_for);
461
return true;
462
}
463
464
bool FormatTokenLexer::tryMergeForEach() {
465
if (Tokens.size() < 2)
466
return false;
467
auto &For = *(Tokens.end() - 2);
468
auto &Each = *(Tokens.end() - 1);
469
if (For->isNot(tok::kw_for))
470
return false;
471
if (Each->isNot(tok::identifier))
472
return false;
473
if (Each->TokenText != "each")
474
return false;
475
476
For->setType(TT_ForEachMacro);
477
For->Tok.setKind(tok::kw_for);
478
479
For->TokenText = StringRef(For->TokenText.begin(),
480
Each->TokenText.end() - For->TokenText.begin());
481
For->ColumnWidth += Each->ColumnWidth;
482
Tokens.erase(Tokens.end() - 1);
483
return true;
484
}
485
486
bool FormatTokenLexer::tryTransformTryUsageForC() {
487
if (Tokens.size() < 2)
488
return false;
489
auto &Try = *(Tokens.end() - 2);
490
if (Try->isNot(tok::kw_try))
491
return false;
492
auto &Next = *(Tokens.end() - 1);
493
if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
494
return false;
495
496
if (Tokens.size() > 2) {
497
auto &At = *(Tokens.end() - 3);
498
if (At->is(tok::at))
499
return false;
500
}
501
502
Try->Tok.setKind(tok::identifier);
503
return true;
504
}
505
506
bool FormatTokenLexer::tryMergeLessLess() {
507
// Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
508
if (Tokens.size() < 3)
509
return false;
510
511
auto First = Tokens.end() - 3;
512
if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
513
return false;
514
515
// Only merge if there currently is no whitespace between the two "<".
516
if (First[1]->hasWhitespaceBefore())
517
return false;
518
519
auto X = Tokens.size() > 3 ? First[-1] : nullptr;
520
if (X && X->is(tok::less))
521
return false;
522
523
auto Y = First[2];
524
if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
525
return false;
526
527
First[0]->Tok.setKind(tok::lessless);
528
First[0]->TokenText = "<<";
529
First[0]->ColumnWidth += 1;
530
Tokens.erase(Tokens.end() - 2);
531
return true;
532
}
533
534
bool FormatTokenLexer::tryMergeGreaterGreater() {
535
// Merge kw_operator,greater,greater into kw_operator,greatergreater.
536
if (Tokens.size() < 2)
537
return false;
538
539
auto First = Tokens.end() - 2;
540
if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
541
return false;
542
543
// Only merge if there currently is no whitespace between the first two ">".
544
if (First[1]->hasWhitespaceBefore())
545
return false;
546
547
auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
548
if (Tok && Tok->isNot(tok::kw_operator))
549
return false;
550
551
First[0]->Tok.setKind(tok::greatergreater);
552
First[0]->TokenText = ">>";
553
First[0]->ColumnWidth += 1;
554
Tokens.erase(Tokens.end() - 1);
555
return true;
556
}
557
558
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
559
TokenType NewType) {
560
if (Tokens.size() < Kinds.size())
561
return false;
562
563
SmallVectorImpl<FormatToken *>::const_iterator First =
564
Tokens.end() - Kinds.size();
565
for (unsigned i = 0; i < Kinds.size(); ++i)
566
if (First[i]->isNot(Kinds[i]))
567
return false;
568
569
return tryMergeTokens(Kinds.size(), NewType);
570
}
571
572
/// Merges the \p Count newest tokens into the first of them and gives the
/// result the type \p NewType.
///
/// The merge is refused if any of the tokens after the first one has
/// whitespace in front of it. On success the text of the first token is
/// lengthened by the combined text sizes of the tokens it absorbs, and its
/// column width grows by the same amount.
///
/// \param Count   number of trailing tokens to merge. A count of zero never
///                merges anything.
/// \param NewType token type assigned to the merged token.
/// \returns true if the tokens were merged.
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  // Count == 0 has to be rejected: otherwise the resize below would grow the
  // vector by one element and the merged-token accesses would hit a slot
  // that does not hold a lexed token.
  if (Count == 0 || Tokens.size() < Count)
    return false;

  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Fetch the surviving token before the vector is shrunk.
  FormatToken *Merged = First[0];
  Tokens.resize(Tokens.size() - Count + 1);
  Merged->TokenText =
      StringRef(Merged->TokenText.data(), Merged->TokenText.size() + AddLength);
  Merged->ColumnWidth += AddLength;
  Merged->setType(NewType);
  return true;
}
593
594
bool FormatTokenLexer::tryMergeTokensAny(
595
ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
596
return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
597
return tryMergeTokens(Kinds, NewType);
598
});
599
}
600
601
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
602
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
603
// NB: This is not entirely correct, as an r_paren can introduce an operand
604
// location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
605
// corner case to not matter in practice, though.
606
return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
607
tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
608
tok::colon, tok::question, tok::tilde) ||
609
Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
610
tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
611
tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
612
Tok->isBinaryOperator();
613
}
614
615
/// Decides whether a regex literal may start directly after \p Prev, the
/// previous non-comment token. A null \p Prev (nothing precedes) allows it.
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  // A regex literal may follow a prefix unary operator but not a postfix one.
  // For '++' and '--' the token two positions further back tells them apart:
  // if that token does not introduce an operand, the slash is an operator and
  // not the start of a regex. `!` is a prefix operator too, but also a
  // postfix operator that casts away nullability, so it is checked the same
  // way.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    const size_t NumTokens = Tokens.size();
    return NumTokens < 3 || precedesOperand(Tokens[NumTokens - 3]);
  }

  // Otherwise the previous token itself has to introduce an operand location
  // where regex literals can occur.
  return precedesOperand(Prev);
}
635
636
// Tries to parse a JavaScript Regex literal starting at the current token,
637
// if that begins with a slash and is in a location where JavaScript allows
638
// regex literals. Changes the current token to a regex literal and updates
639
// its text if successful.
640
void FormatTokenLexer::tryParseJSRegexLiteral() {
641
FormatToken *RegexToken = Tokens.back();
642
if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
643
return;
644
645
FormatToken *Prev = nullptr;
646
for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
647
// NB: Because previous pointers are not initialized yet, this cannot use
648
// Token.getPreviousNonComment.
649
if (FT->isNot(tok::comment)) {
650
Prev = FT;
651
break;
652
}
653
}
654
655
if (!canPrecedeRegexLiteral(Prev))
656
return;
657
658
// 'Manually' lex ahead in the current file buffer.
659
const char *Offset = Lex->getBufferLocation();
660
const char *RegexBegin = Offset - RegexToken->TokenText.size();
661
StringRef Buffer = Lex->getBuffer();
662
bool InCharacterClass = false;
663
bool HaveClosingSlash = false;
664
for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
665
// Regular expressions are terminated with a '/', which can only be
666
// escaped using '\' or a character class between '[' and ']'.
667
// See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
668
switch (*Offset) {
669
case '\\':
670
// Skip the escaped character.
671
++Offset;
672
break;
673
case '[':
674
InCharacterClass = true;
675
break;
676
case ']':
677
InCharacterClass = false;
678
break;
679
case '/':
680
if (!InCharacterClass)
681
HaveClosingSlash = true;
682
break;
683
}
684
}
685
686
RegexToken->setType(TT_RegexLiteral);
687
// Treat regex literals like other string_literals.
688
RegexToken->Tok.setKind(tok::string_literal);
689
RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
690
RegexToken->ColumnWidth = RegexToken->TokenText.size();
691
692
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
693
}
694
695
/// Scans the body of a C# string literal for its terminating '"'.
///
/// \param Begin        first character after the opening quote.
/// \param End          end of the buffer to scan.
/// \param Verbatim     if set, a backslash is an ordinary character and ""
///                     stands for an escaped double quote.
/// \param Interpolated if set, braces are tracked: {{ and }} are escapes, and
///                     a '"' between an unmatched { and its } does not
///                     terminate the string.
/// \returns a pointer to the terminating '"', or \p End if there is none or
///          if an interpolated string contains a } without a matching {.
///
/// No attempt is made to format code inside an interpolated or verbatim
/// string. For example, $"{x ?? "null"}" must not be split into $"{x ?? ",
/// null and "}" but is treated as one string literal. Formatting the
/// expressions inside {} would need work similar to what
/// `handleTemplateStrings()` does for JavaScript template strings.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character right after Cursor exists and repeats *Cursor.
  const auto IsDoubled = [End](const char *Cursor) {
    return Cursor + 1 < End && Cursor[1] == Cursor[0];
  };

  int OpenBraces = 0;
  const char *Cursor = Begin;
  while (Cursor < End) {
    const char Current = *Cursor;
    if (Current == '\\') {
      // Outside verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Cursor;
    } else if (Current == '{') {
      if (Interpolated) {
        // {{ is an escaped brace; a single { opens an expression.
        if (IsDoubled(Cursor))
          ++Cursor;
        else
          ++OpenBraces;
      }
    } else if (Current == '}') {
      if (Interpolated) {
        // }} is an escaped brace; a single } closes an expression.
        if (IsDoubled(Cursor))
          ++Cursor;
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End;
      }
    } else if (Current == '"' && OpenBraces == 0) {
      // "" within a verbatim string is an escaped double quote; any other
      // quote at this point terminates the string.
      if (!(Verbatim && IsDoubled(Cursor)))
        return Cursor;
      ++Cursor;
    }
    ++Cursor;
  }

  return End;
}
753
754
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
755
FormatToken *CSharpStringLiteral = Tokens.back();
756
757
if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
758
return;
759
760
auto &TokenText = CSharpStringLiteral->TokenText;
761
762
bool Verbatim = false;
763
bool Interpolated = false;
764
if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
765
Verbatim = true;
766
Interpolated = true;
767
} else if (TokenText.starts_with(R"(@")")) {
768
Verbatim = true;
769
} else if (TokenText.starts_with(R"($")")) {
770
Interpolated = true;
771
}
772
773
// Deal with multiline strings.
774
if (!Verbatim && !Interpolated)
775
return;
776
777
const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
778
const char *Offset = StrBegin;
779
if (Verbatim && Interpolated)
780
Offset += 3;
781
else
782
Offset += 2;
783
784
const auto End = Lex->getBuffer().end();
785
Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
786
787
// Make no attempt to format code properly if a verbatim string is
788
// unterminated.
789
if (Offset >= End)
790
return;
791
792
StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
793
TokenText = LiteralText;
794
795
// Adjust width for potentially multiline string literals.
796
size_t FirstBreak = LiteralText.find('\n');
797
StringRef FirstLineText = FirstBreak == StringRef::npos
798
? LiteralText
799
: LiteralText.substr(0, FirstBreak);
800
CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
801
FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
802
Encoding);
803
size_t LastBreak = LiteralText.rfind('\n');
804
if (LastBreak != StringRef::npos) {
805
CSharpStringLiteral->IsMultiline = true;
806
unsigned StartColumn = 0;
807
CSharpStringLiteral->LastLineColumnWidth =
808
encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
809
StartColumn, Style.TabWidth, Encoding);
810
}
811
812
assert(Offset < End);
813
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
814
}
815
816
void FormatTokenLexer::handleTableGenMultilineString() {
817
FormatToken *MultiLineString = Tokens.back();
818
if (MultiLineString->isNot(TT_TableGenMultiLineString))
819
return;
820
821
auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
822
// "}]" is the end of multi line string.
823
auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
824
if (CloseOffset == StringRef::npos)
825
return;
826
auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
827
MultiLineString->TokenText = Text;
828
resetLexer(SourceMgr.getFileOffset(
829
Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
830
auto FirstLineText = Text;
831
auto FirstBreak = Text.find('\n');
832
// Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
833
if (FirstBreak != StringRef::npos) {
834
MultiLineString->IsMultiline = true;
835
FirstLineText = Text.substr(0, FirstBreak + 1);
836
// LastLineColumnWidth holds the width of the last line.
837
auto LastBreak = Text.rfind('\n');
838
MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
839
Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
840
Style.TabWidth, Encoding);
841
}
842
// ColumnWidth holds only the width of the first line.
843
MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
844
FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
845
}
846
847
void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
848
FormatToken *Tok = Tokens.back();
849
// TableGen identifiers can begin with digits. Such tokens are lexed as
850
// numeric_constant now.
851
if (Tok->isNot(tok::numeric_constant))
852
return;
853
StringRef Text = Tok->TokenText;
854
// The following check is based on llvm::TGLexer::LexToken.
855
// That lexes the token as a number if any of the following holds:
856
// 1. It starts with '+', '-'.
857
// 2. All the characters are digits.
858
// 3. The first non-digit character is 'b', and the next is '0' or '1'.
859
// 4. The first non-digit character is 'x', and the next is a hex digit.
860
// Note that in the case 3 and 4, if the next character does not exists in
861
// this token, the token is an identifier.
862
if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
863
return;
864
const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
865
// All the characters are digits
866
if (NonDigitPos == StringRef::npos)
867
return;
868
char FirstNonDigit = Text[NonDigitPos];
869
if (NonDigitPos < Text.size() - 1) {
870
char TheNext = Text[NonDigitPos + 1];
871
// Regarded as a binary number.
872
if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
873
return;
874
// Regarded as hex number.
875
if (FirstNonDigit == 'x' && isxdigit(TheNext))
876
return;
877
}
878
if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
879
// This is actually an identifier in TableGen.
880
Tok->Tok.setKind(tok::identifier);
881
Tok->Tok.setIdentifierInfo(nullptr);
882
}
883
}
884
885
void FormatTokenLexer::handleTemplateStrings() {
886
FormatToken *BacktickToken = Tokens.back();
887
888
if (BacktickToken->is(tok::l_brace)) {
889
StateStack.push(LexerState::NORMAL);
890
return;
891
}
892
if (BacktickToken->is(tok::r_brace)) {
893
if (StateStack.size() == 1)
894
return;
895
StateStack.pop();
896
if (StateStack.top() != LexerState::TEMPLATE_STRING)
897
return;
898
// If back in TEMPLATE_STRING, fallthrough and continue parsing the
899
} else if (BacktickToken->is(tok::unknown) &&
900
BacktickToken->TokenText == "`") {
901
StateStack.push(LexerState::TEMPLATE_STRING);
902
} else {
903
return; // Not actually a template
904
}
905
906
// 'Manually' lex ahead in the current file buffer.
907
const char *Offset = Lex->getBufferLocation();
908
const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
909
for (; Offset != Lex->getBuffer().end(); ++Offset) {
910
if (Offset[0] == '`') {
911
StateStack.pop();
912
++Offset;
913
break;
914
}
915
if (Offset[0] == '\\') {
916
++Offset; // Skip the escaped character.
917
} else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
918
Offset[1] == '{') {
919
// '${' introduces an expression interpolation in the template string.
920
StateStack.push(LexerState::NORMAL);
921
Offset += 2;
922
break;
923
}
924
}
925
926
StringRef LiteralText(TmplBegin, Offset - TmplBegin);
927
BacktickToken->setType(TT_TemplateString);
928
BacktickToken->Tok.setKind(tok::string_literal);
929
BacktickToken->TokenText = LiteralText;
930
931
// Adjust width for potentially multiline string literals.
932
size_t FirstBreak = LiteralText.find('\n');
933
StringRef FirstLineText = FirstBreak == StringRef::npos
934
? LiteralText
935
: LiteralText.substr(0, FirstBreak);
936
BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
937
FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
938
size_t LastBreak = LiteralText.rfind('\n');
939
if (LastBreak != StringRef::npos) {
940
BacktickToken->IsMultiline = true;
941
unsigned StartColumn = 0; // The template tail spans the entire line.
942
BacktickToken->LastLineColumnWidth =
943
encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
944
StartColumn, Style.TabWidth, Encoding);
945
}
946
947
SourceLocation loc = Lex->getSourceLocation(Offset);
948
resetLexer(SourceMgr.getFileOffset(loc));
949
}
950
951
void FormatTokenLexer::tryParsePythonComment() {
952
FormatToken *HashToken = Tokens.back();
953
if (!HashToken->isOneOf(tok::hash, tok::hashhash))
954
return;
955
// Turn the remainder of this line into a comment.
956
const char *CommentBegin =
957
Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
958
size_t From = CommentBegin - Lex->getBuffer().begin();
959
size_t To = Lex->getBuffer().find_first_of('\n', From);
960
if (To == StringRef::npos)
961
To = Lex->getBuffer().size();
962
size_t Len = To - From;
963
HashToken->setType(TT_LineComment);
964
HashToken->Tok.setKind(tok::comment);
965
HashToken->TokenText = Lex->getBuffer().substr(From, Len);
966
SourceLocation Loc = To < Lex->getBuffer().size()
967
? Lex->getSourceLocation(CommentBegin + Len)
968
: SourceMgr.getLocForEndOfFile(ID);
969
resetLexer(SourceMgr.getFileOffset(Loc));
970
}
971
972
bool FormatTokenLexer::tryMerge_TMacro() {
973
if (Tokens.size() < 4)
974
return false;
975
FormatToken *Last = Tokens.back();
976
if (Last->isNot(tok::r_paren))
977
return false;
978
979
FormatToken *String = Tokens[Tokens.size() - 2];
980
if (String->isNot(tok::string_literal) || String->IsMultiline)
981
return false;
982
983
if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
984
return false;
985
986
FormatToken *Macro = Tokens[Tokens.size() - 4];
987
if (Macro->TokenText != "_T")
988
return false;
989
990
const char *Start = Macro->TokenText.data();
991
const char *End = Last->TokenText.data() + Last->TokenText.size();
992
String->TokenText = StringRef(Start, End - Start);
993
String->IsFirst = Macro->IsFirst;
994
String->LastNewlineOffset = Macro->LastNewlineOffset;
995
String->WhitespaceRange = Macro->WhitespaceRange;
996
String->OriginalColumn = Macro->OriginalColumn;
997
String->ColumnWidth = encoding::columnWidthWithTabs(
998
String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
999
String->NewlinesBefore = Macro->NewlinesBefore;
1000
String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1001
1002
Tokens.pop_back();
1003
Tokens.pop_back();
1004
Tokens.pop_back();
1005
Tokens.back() = String;
1006
if (FirstInLineIndex >= Tokens.size())
1007
FirstInLineIndex = Tokens.size() - 1;
1008
return true;
1009
}
1010
1011
bool FormatTokenLexer::tryMergeConflictMarkers() {
1012
if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1013
return false;
1014
1015
// Conflict lines look like:
1016
// <marker> <text from the vcs>
1017
// For example:
1018
// >>>>>>> /file/in/file/system at revision 1234
1019
//
1020
// We merge all tokens in a line that starts with a conflict marker
1021
// into a single token with a special token type that the unwrapped line
1022
// parser will use to correctly rebuild the underlying code.
1023
1024
FileID ID;
1025
// Get the position of the first token in the line.
1026
unsigned FirstInLineOffset;
1027
std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1028
Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1029
StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1030
// Calculate the offset of the start of the current line.
1031
auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1032
if (LineOffset == StringRef::npos)
1033
LineOffset = 0;
1034
else
1035
++LineOffset;
1036
1037
auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1038
StringRef LineStart;
1039
if (FirstSpace == StringRef::npos)
1040
LineStart = Buffer.substr(LineOffset);
1041
else
1042
LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1043
1044
TokenType Type = TT_Unknown;
1045
if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1046
Type = TT_ConflictStart;
1047
} else if (LineStart == "|||||||" || LineStart == "=======" ||
1048
LineStart == "====") {
1049
Type = TT_ConflictAlternative;
1050
} else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1051
Type = TT_ConflictEnd;
1052
}
1053
1054
if (Type != TT_Unknown) {
1055
FormatToken *Next = Tokens.back();
1056
1057
Tokens.resize(FirstInLineIndex + 1);
1058
// We do not need to build a complete token here, as we will skip it
1059
// during parsing anyway (as we must not touch whitespace around conflict
1060
// markers).
1061
Tokens.back()->setType(Type);
1062
Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1063
1064
Tokens.push_back(Next);
1065
return true;
1066
}
1067
1068
return false;
1069
}
1070
1071
/// Create the synthesized second '>' or '<' token from the current token and
/// make it the current token.
///
/// The new token copies the current token's Token and text, is located at
/// the last character of that token, has no preceding whitespace, and sits
/// one column to the right.
FormatToken *FormatTokenLexer::getStashedToken() {
  // Capture everything needed from the first half before FormatTok is
  // replaced.
  const Token FirstHalf = FormatTok->Tok;
  const StringRef Text = FormatTok->TokenText;
  const unsigned FirstColumn = FormatTok->OriginalColumn;
  const SourceLocation SecondLoc =
      FirstHalf.getLocation().getLocWithOffset(FirstHalf.getLength() - 1);

  FormatToken *const SecondHalf = new (Allocator.Allocate()) FormatToken;
  SecondHalf->Tok = FirstHalf;
  SecondHalf->Tok.setLocation(SecondLoc);
  SecondHalf->WhitespaceRange = SourceRange(SecondLoc, SecondLoc);
  SecondHalf->TokenText = Text;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstColumn + 1;

  FormatTok = SecondHalf;
  return FormatTok;
}
1089
1090
/// Truncate the current token to the new length and make the lexer continue
1091
/// from the end of the truncated token. Used for other languages that have
1092
/// different token boundaries, like JavaScript in which a comment ends at a
1093
/// line break regardless of whether the line break follows a backslash. Also
1094
/// used to set the lexer to the end of whitespace if the lexer regards
1095
/// whitespace and an unrecognized symbol as one token.
1096
void FormatTokenLexer::truncateToken(size_t NewLen) {
1097
assert(NewLen <= FormatTok->TokenText.size());
1098
resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1099
Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1100
FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1101
FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1102
FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1103
Encoding);
1104
FormatTok->Tok.setLength(NewLen);
1105
}
1106
1107
/// Count the length of leading whitespace in a token.
1108
static size_t countLeadingWhitespace(StringRef Text) {
1109
// Basically counting the length matched by this regex.
1110
// "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1111
// Directly using the regex turned out to be slow. With the regex
1112
// version formatting all files in this directory took about 1.25
1113
// seconds. This version took about 0.5 seconds.
1114
const unsigned char *const Begin = Text.bytes_begin();
1115
const unsigned char *const End = Text.bytes_end();
1116
const unsigned char *Cur = Begin;
1117
while (Cur < End) {
1118
if (isspace(Cur[0])) {
1119
++Cur;
1120
} else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1121
// A '\' followed by a newline always escapes the newline, regardless
1122
// of whether there is another '\' before it.
1123
// The source has a null byte at the end. So the end of the entire input
1124
// isn't reached yet. Also the lexer doesn't break apart an escaped
1125
// newline.
1126
assert(End - Cur >= 2);
1127
Cur += 2;
1128
} else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1129
(Cur[3] == '\n' || Cur[3] == '\r')) {
1130
// Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1131
// characters are quoted individually in this comment because if we write
1132
// them together some compilers warn that we have a trigraph in the code.
1133
assert(End - Cur >= 4);
1134
Cur += 4;
1135
} else {
1136
break;
1137
}
1138
}
1139
return Cur - Begin;
1140
}
1141
1142
/// Lex and return the next non-whitespace token.
///
/// Whitespace in front of the token is consumed and recorded on the token
/// (NewlinesBefore, HasUnescapedNewline, LastNewlineOffset, WhitespaceRange)
/// while Column is kept up to date. Afterwards the token is adjusted for the
/// language being formatted, its column width is computed and, for C++,
/// configured macro and type names are tagged.
FormatToken *FormatTokenLexer::getNextToken() {
  // The previous call split a '>>' or '<<' token; return its second half.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  const SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    const auto LeadingWhitespace =
        countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);

    const StringRef Blank = FormatTok->TokenText;
    bool InEscape = false;
    for (int Idx = 0, Size = Blank.size(); Idx != Size; ++Idx) {
      switch (Blank[Idx]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (Idx + 1 < Size && Blank[Idx + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (InEscape)
          InEscape = false;
        else
          FormatTok->HasUnescapedNewline = true;
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Blank.substr(Idx, 2) == "\\\r" ||
               Blank.substr(Idx, 2) == "\\\n" ||
               Blank.substr(Idx, 4) == "\?\?/\r" ||
               Blank.substr(Idx, 4) == "\?\?/\n" ||
               (Idx >= 1 && (Blank.substr(Idx - 1, 4) == "\?\?/\r" ||
                             Blank.substr(Idx - 1, 4) == "\?\?/\n")) ||
               (Idx >= 2 && (Blank.substr(Idx - 2, 4) == "\?\?/\r" ||
                             Blank.substr(Idx - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Blank.size();
    readRawToken(*FormatTok);
  }

  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
    for (size_t Backslash = FormatTok->TokenText.find('\\');
         Backslash != StringRef::npos;
         Backslash = FormatTok->TokenText.find('\\', Backslash + 1)) {
      if (Backslash + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[Backslash + 1] == '\n') {
        truncateToken(Backslash + 1);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      const auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (Tokens.size() > 0 &&
                 Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  // Turns the current '>>' or '<<' token into its first half and arranges for
  // the second half to be returned by the next call.
  const auto SplitDoubledToken = [this](tok::TokenKind FirstHalfKind) {
    FormatTok->Tok.setKind(FirstHalfKind);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  };

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    const StringRef Untrimmed = FormatTok->TokenText;
    FormatTok->TokenText = Untrimmed.rtrim(" \t\v\f");
    TrailingWhitespace = Untrimmed.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    // These keywords are demoted to plain identifiers for the language being
    // formatted.
    const bool DemoteToIdentifier =
        (Style.Language == FormatStyle::LK_Java &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                            tok::kw_operator)) ||
        (Style.isJavaScript() &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                            tok::kw_operator)) ||
        (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok));
    if (DemoteToIdentifier) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->is(tok::greatergreater)) {
    SplitDoubledToken(tok::greater);
  } else if (FormatTok->is(tok::lessless)) {
    SplitDoubledToken(tok::less);
  }

  if (Style.isVerilog() && Tokens.size() > 0 &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  const StringRef TokText = FormatTok->TokenText;
  const size_t FirstNewline = TokText.find('\n');
  if (FirstNewline == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        TokText, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        TokText.substr(0, FirstNewline), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        TokText.substr(TokText.find_last_of('\n') + 1), 0, Style.TabWidth,
        Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    const auto MacroIt = Macros.find(Identifier);
    // A name that directly follows `define` is not tagged as a macro.
    const bool FollowsDefine =
        Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
        Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
            tok::pp_define;
    if (!FollowsDefine && MacroIt != Macros.end()) {
      FormatTok->setType(MacroIt->second);
      if (MacroIt->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(TokText))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(TokText))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
    }
  }

  return FormatTok;
}
1378
1379
/// Try to lex a Verilog-specific token at the lexer's current position.
///
/// On success \p Tok is filled in as a tok::raw_identifier covering the
/// matched text and the lexer is advanced behind it.
///
/// \returns true if a token was produced; false if the caller should fall
/// back to the regular lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // The backtick and double backtick are made identifiers so that they can be
  // matched against more easily.
  //
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace, unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. That case
  // is checked for outside of the regex since negative lookahead assertions
  // are not available. Simply changing the '*' to '+' breaks stuff as the
  // escaped identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together. But escaped
  // identifiers usually occur only in generated code anyway.
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  const char *const Start = Lex->getBufferLocation();
  const StringRef Remaining(Start, Lex->getBuffer().end() - Start);
  SmallVector<StringRef, 4> Matches;
  if (!VerilogToken.match(Remaining, &Matches))
    return false;

  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  const bool IsEscapedNewline =
      Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n');
  if (IsEscapedNewline)
    return false;

  const size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1419
1420
/// Lex one raw token into \p Tok, set its text, and apply the
/// kind adjustments that do not depend on surrounding tokens.
///
/// Also maintains FormattingDisabled based on the clang-format on/off
/// comments encountered and marks the token as finalized while formatting
/// is disabled.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  const bool LexedAsVerilog =
      Style.isVerilog() && readRawTokenVerilogSpecific(Tok.Tok);
  if (!LexedAsVerilog)
    Lex->LexFromRawLexer(Tok.Tok);

  const char *const TextBegin =
      SourceMgr.getCharacterData(Tok.Tok.getLocation());
  Tok.TokenText = StringRef(TextBegin, Tok.Tok.getLength());

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.starts_with("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // For JavaScript and Proto, character constants are handled as string
  // literals.
  const bool CharConstantIsString = Style.isJavaScript() || Style.isProto();
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // An "on" comment takes effect before the comment's own Finalized flag is
  // set; an "off" comment only takes effect afterwards.
  const bool IsComment = Tok.is(tok::comment);
  if (IsComment && isClangFormatOn(Tok.TokenText))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (IsComment && isClangFormatOff(Tok.TokenText))
    FormattingDisabled = true;
}
1449
1450
void FormatTokenLexer::resetLexer(unsigned Offset) {
1451
StringRef Buffer = SourceMgr.getBufferData(ID);
1452
Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1453
Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1454
Lex->SetKeepWhitespaceMode(true);
1455
TrailingWhitespace = 0;
1456
}
1457
1458
} // namespace format
1459
} // namespace clang
1460
1461