Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/AST/CommentLexer.cpp
35260 views
1
//===--- CommentLexer.cpp -------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang/AST/CommentLexer.h"
10
#include "clang/AST/CommentCommandTraits.h"
11
#include "clang/AST/CommentDiagnostic.h"
12
#include "clang/Basic/CharInfo.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/ADT/StringSwitch.h"
15
#include "llvm/Support/ConvertUTF.h"
16
#include "llvm/Support/ErrorHandling.h"
17
18
namespace clang {
19
namespace comments {
20
21
void Token::dump(const Lexer &L, const SourceManager &SM) const {
22
llvm::errs() << "comments::Token Kind=" << Kind << " ";
23
Loc.print(llvm::errs(), SM);
24
llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25
}
26
27
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28
return isLetter(C);
29
}
30
31
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32
return isDigit(C);
33
}
34
35
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36
return isHexDigit(C);
37
}
38
39
/// Encodes \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns the encoded bytes, or an empty StringRef if
/// llvm::ConvertCodePointToUTF8 reports failure.
static inline StringRef
convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint) {
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter advances End past the bytes it wrote.
  char *End = Begin;
  const bool Converted = llvm::ConvertCodePointToUTF8(CodePoint, End);
  return Converted ? StringRef(Begin, End - Begin) : StringRef();
}
49
50
namespace {
51
52
#include "clang/AST/CommentHTMLTags.inc"
53
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55
} // end anonymous namespace
56
57
/// Maps the name of an HTML named character reference (without the leading
/// '&' and trailing ';') to its replacement text.
///
/// \returns the replacement text, or whatever
/// translateHTMLNamedCharacterReferenceToUTF8() yields for names that are not
/// one of the five hard-coded ones.
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path: check a few of the most widely used named character
  // references first.
  const StringRef Common = llvm::StringSwitch<StringRef>(Name)
                               .Case("amp", "&")
                               .Case("lt", "<")
                               .Case("gt", ">")
                               .Case("quot", "\"")
                               .Case("apos", "\'")
                               .Default(StringRef());
  if (!Common.empty())
    return Common;

  // Slow path. It is deliberately kept out of the StringSwitch above: an
  // argument to .Default() is evaluated unconditionally, which would run the
  // full table lookup even when the fast path matches.
  return translateHTMLNamedCharacterReferenceToUTF8(Name);
}
68
69
/// Resolves the digits of a decimal HTML character reference (the "65" of
/// "&#65;") to UTF-8 text allocated from the lexer's allocator.
///
/// \param Name the digits only; every character must be a decimal digit.
/// \returns the UTF-8 encoding, or an empty StringRef if the value cannot be
/// encoded.
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // One past the last Unicode code point (U+10FFFF). The accumulator
  // saturates here so that an overlong digit string cannot wrap around the
  // unsigned range and alias an unrelated, valid code point. The saturated
  // value is outside the Unicode range, so the conversion below is expected
  // to fail and yield an empty result.
  const unsigned FirstInvalidCodePoint = 0x110000;

  unsigned CodePoint = 0;
  for (const char Digit : Name) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Digit));
    // Cannot overflow: CodePoint < 0x110000 holds before each step.
    CodePoint = CodePoint * 10 + (Digit - '0');
    if (CodePoint >= FirstInvalidCodePoint) {
      CodePoint = FirstInvalidCodePoint;
      break;
    }
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
78
79
/// Resolves the digits of a hexadecimal HTML character reference (the "41" of
/// "&#x41;") to UTF-8 text allocated from the lexer's allocator.
///
/// \param Name the digits only; every character must be a hex digit.
/// \returns the UTF-8 encoding, or an empty StringRef if the value cannot be
/// encoded.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // One past the last Unicode code point (U+10FFFF). The accumulator
  // saturates here so that an overlong digit string cannot wrap around the
  // unsigned range and alias an unrelated, valid code point. The saturated
  // value is outside the Unicode range, so the conversion below is expected
  // to fail and yield an empty result.
  const unsigned FirstInvalidCodePoint = 0x110000;

  unsigned CodePoint = 0;
  for (const char C : Name) {
    assert(isHTMLHexCharacterReferenceCharacter(C));
    // Cannot overflow: CodePoint < 0x110000 holds before each step.
    CodePoint = CodePoint * 16 + llvm::hexDigitValue(C);
    if (CodePoint >= FirstInvalidCodePoint) {
      CodePoint = FirstInvalidCodePoint;
      break;
    }
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
89
90
void Lexer::skipLineStartingDecorations() {
91
// This function should be called only for C comments
92
assert(CommentState == LCS_InsideCComment);
93
94
if (BufferPtr == CommentEnd)
95
return;
96
97
const char *NewBufferPtr = BufferPtr;
98
while (isHorizontalWhitespace(*NewBufferPtr))
99
if (++NewBufferPtr == CommentEnd)
100
return;
101
if (*NewBufferPtr == '*')
102
BufferPtr = NewBufferPtr + 1;
103
}
104
105
namespace {
106
/// Returns pointer to the first newline character in the string.
107
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109
if (isVerticalWhitespace(*BufferPtr))
110
return BufferPtr;
111
}
112
return BufferEnd;
113
}
114
115
/// Skips a single line terminator at \p BufferPtr.
///
/// Recognizes "\n", "\r" and "\r\n". An empty range is returned unchanged.
/// When the range is not empty, \p BufferPtr must point at '\n' or '\r'.
///
/// \returns a pointer just past the terminator, never beyond \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  ++BufferPtr;
  // Treat "\r\n" as one terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129
130
const char *skipNamedCharacterReference(const char *BufferPtr,
131
const char *BufferEnd) {
132
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133
if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134
return BufferPtr;
135
}
136
return BufferEnd;
137
}
138
139
const char *skipDecimalCharacterReference(const char *BufferPtr,
140
const char *BufferEnd) {
141
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142
if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143
return BufferPtr;
144
}
145
return BufferEnd;
146
}
147
148
const char *skipHexCharacterReference(const char *BufferPtr,
149
const char *BufferEnd) {
150
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151
if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152
return BufferPtr;
153
}
154
return BufferEnd;
155
}
156
157
bool isHTMLIdentifierStartingCharacter(char C) {
158
return isLetter(C);
159
}
160
161
bool isHTMLIdentifierCharacter(char C) {
162
return isAlphanumeric(C);
163
}
164
165
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167
if (!isHTMLIdentifierCharacter(*BufferPtr))
168
return BufferPtr;
169
}
170
return BufferEnd;
171
}
172
173
/// Skips an HTML string quoted in single or double quotes.
///
/// \p BufferPtr must point at the opening quote, so the range must not be
/// empty. A quote character that is immediately preceded by a backslash does
/// not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) {
  assert(BufferPtr != BufferEnd && "expected an opening quote");
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start just past the opening quote; Cur[-1] is therefore always in range.
  for (const char *Cur = BufferPtr + 1; Cur != BufferEnd; ++Cur) {
    if (*Cur == Quote && Cur[-1] != '\\')
      return Cur;
  }
  return BufferEnd;
}
190
191
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193
if (!isWhitespace(*BufferPtr))
194
return BufferPtr;
195
}
196
return BufferEnd;
197
}
198
199
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200
return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201
}
202
203
bool isCommandNameStartCharacter(char C) {
204
return isLetter(C);
205
}
206
207
bool isCommandNameCharacter(char C) {
208
return isAlphanumeric(C);
209
}
210
211
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213
if (!isCommandNameCharacter(*BufferPtr))
214
return BufferPtr;
215
}
216
return BufferEnd;
217
}
218
219
/// Return the one past end pointer for BCPL comments.
220
/// Handles newlines escaped with backslash or trigraph for backslahs.
221
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222
const char *CurPtr = BufferPtr;
223
while (CurPtr != BufferEnd) {
224
while (!isVerticalWhitespace(*CurPtr)) {
225
CurPtr++;
226
if (CurPtr == BufferEnd)
227
return BufferEnd;
228
}
229
// We found a newline, check if it is escaped.
230
const char *EscapePtr = CurPtr - 1;
231
while(isHorizontalWhitespace(*EscapePtr))
232
EscapePtr--;
233
234
if (*EscapePtr == '\\' ||
235
(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236
EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237
// We found an escaped newline.
238
CurPtr = skipNewline(CurPtr, BufferEnd);
239
} else
240
return CurPtr; // Not an escaped newline.
241
}
242
return BufferEnd;
243
}
244
245
/// Return the one past end pointer for C comments.
246
/// Very dumb, does not handle escaped newlines or trigraphs.
247
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248
for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249
if (*BufferPtr == '*') {
250
assert(BufferPtr + 1 != BufferEnd);
251
if (*(BufferPtr + 1) == '/')
252
return BufferPtr;
253
}
254
}
255
llvm_unreachable("buffer end hit before '*/' was seen");
256
}
257
258
} // end anonymous namespace
259
260
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261
tok::TokenKind Kind) {
262
const unsigned TokLen = TokEnd - BufferPtr;
263
Result.setLocation(getSourceLocation(BufferPtr));
264
Result.setKind(Kind);
265
Result.setLength(TokLen);
266
#ifndef NDEBUG
267
Result.TextPtr = "<UNSET>";
268
Result.IntVal = 7;
269
#endif
270
BufferPtr = TokEnd;
271
}
272
273
const char *Lexer::skipTextToken() {
274
const char *TokenPtr = BufferPtr;
275
assert(TokenPtr < CommentEnd);
276
StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277
278
again:
279
size_t End =
280
StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281
if (End == StringRef::npos)
282
return CommentEnd;
283
284
// Doxygen doesn't recognize any commands in a one-line double quotation.
285
// If we don't find an ending quotation mark, we pretend it never began.
286
if (*(TokenPtr + End) == '\"') {
287
TokenPtr += End + 1;
288
End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289
if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290
TokenPtr += End + 1;
291
goto again;
292
}
293
return TokenPtr + End;
294
}
295
296
/// Lexes one token from inside a comment, starting at BufferPtr.
///
/// When commands are not parsed, only text and newline tokens are produced.
/// Otherwise the call is forwarded to the lexer for the current State, or, in
/// the normal state, dispatched on the first character: '\\' and '@'
/// introduce escapes and commands, '&' an HTML character reference, '<' an
/// HTML tag; anything else is text or a newline.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Lexes non-command content, i.e. a run of text or a newline.
  const auto LexTextOrNewline = [&]() {
    assert(State == LS_Normal);

    const char *Cur = BufferPtr;
    assert(Cur < CommentEnd);
    if (*Cur != '\n' && *Cur != '\r') {
      formTextToken(T, skipTextToken());
      return;
    }

    Cur = skipNewline(Cur, CommentEnd);
    formTokenWithChars(T, Cur, tok::newline);

    if (CommentState == LCS_InsideCComment)
      skipLineStartingDecorations();
  };

  // Characters that form an escape sequence when they follow '\\' or '@'.
  const auto IsEscapableChar = [](char Ch) {
    switch (Ch) {
    case '\\': case '@': case '&': case '$':
    case '#':  case '<': case '>': case '%':
    case '\"': case '.': case ':':
      return true;
    default:
      return false;
    }
  };

  if (!ParseCommands) {
    LexTextOrNewline();
    return;
  }

  // Any state other than the normal one has a dedicated lexer.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *Cur = BufferPtr;
  assert(Cur < CommentEnd);
  const char First = *Cur;

  if (First == '&') {
    lexHTMLCharacterReference(T);
    return;
  }

  if (First == '<') {
    ++Cur;
    if (Cur == CommentEnd) {
      formTextToken(T, Cur);
      return;
    }
    const char Next = *Cur;
    if (isHTMLIdentifierStartingCharacter(Next))
      setupAndLexHTMLStartTag(T);
    else if (Next == '/')
      setupAndLexHTMLEndTag(T);
    else
      formTextToken(T, Cur);
    return;
  }

  if (First != '\\' && First != '@') {
    LexTextOrNewline();
    return;
  }

  // Commands that start with a backslash and commands that start with 'at'
  // have equivalent semantics. But we keep information about the exact
  // syntax in AST for comments.
  const tok::TokenKind CommandKind =
      (First == '@') ? tok::at_command : tok::backslash_command;
  ++Cur;
  if (Cur == CommentEnd) {
    formTextToken(T, Cur);
    return;
  }

  char C = *Cur;
  if (IsEscapableChar(C)) {
    // This is one of \\ \@ \& \$ etc escape sequences.
    ++Cur;
    if (C == ':' && Cur != CommentEnd && *Cur == ':') {
      // This is the \:: escape sequence.
      ++Cur;
    }
    // Capture the text before formTokenWithChars() moves BufferPtr.
    StringRef UnescapedText(BufferPtr + 1, Cur - (BufferPtr + 1));
    formTokenWithChars(T, Cur, tok::text);
    T.setText(UnescapedText);
    return;
  }

  // Don't make zero-length commands.
  if (!isCommandNameStartCharacter(C)) {
    formTextToken(T, Cur);
    return;
  }

  Cur = skipCommandName(Cur, CommentEnd);
  unsigned Length = Cur - (BufferPtr + 1);

  // Hardcoded support for lexing LaTeX formula commands
  // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
  if (Length == 1 && Cur[-1] == 'f' && Cur != CommentEnd) {
    C = *Cur;
    if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
        C == '{' || C == '}') {
      ++Cur;
      ++Length;
    }
  }

  StringRef CommandName(BufferPtr + 1, Length);

  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  if (!Info) {
    Info = Traits.getTypoCorrectCommandInfo(CommandName);
    if (!Info) {
      // Neither a known command nor a correctable typo.
      formTokenWithChars(T, Cur, tok::unknown_command);
      T.setUnknownCommandName(CommandName);
      Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
          << SourceRange(T.getLocation(), T.getEndLocation());
      return;
    }

    // Warn about the typo, offer a fix-it, and continue with the corrected
    // command.
    StringRef CorrectedName = Info->Name;
    SourceLocation Loc = getSourceLocation(BufferPtr);
    SourceLocation EndLoc = getSourceLocation(Cur);
    SourceRange FullRange = SourceRange(Loc, EndLoc);
    SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
    Diag(Loc, diag::warn_correct_comment_command_name)
        << FullRange << CommandName << CorrectedName
        << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  }

  if (Info->IsVerbatimBlockCommand) {
    setupAndLexVerbatimBlock(T, Cur, *BufferPtr, Info);
    return;
  }
  if (Info->IsVerbatimLineCommand) {
    setupAndLexVerbatimLine(T, Cur, Info);
    return;
  }
  formTokenWithChars(T, Cur, CommandKind);
  T.setCommandID(Info->getID());
}
458
459
void Lexer::setupAndLexVerbatimBlock(Token &T,
460
const char *TextBegin,
461
char Marker, const CommandInfo *Info) {
462
assert(Info->IsVerbatimBlockCommand);
463
464
VerbatimBlockEndCommandName.clear();
465
VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466
VerbatimBlockEndCommandName.append(Info->EndCommandName);
467
468
formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469
T.setVerbatimBlockID(Info->getID());
470
471
// If there is a newline following the verbatim opening command, skip the
472
// newline so that we don't create an tok::verbatim_block_line with empty
473
// text content.
474
if (BufferPtr != CommentEnd &&
475
isVerticalWhitespace(*BufferPtr)) {
476
BufferPtr = skipNewline(BufferPtr, CommentEnd);
477
State = LS_VerbatimBlockBody;
478
return;
479
}
480
481
State = LS_VerbatimBlockFirstLine;
482
}
483
484
/// Lexes one token from the current line of a verbatim block: either a
/// verbatim_block_line with the text up to the end command or the newline, or
/// a verbatim_block_end if the end command comes first.
///
/// Whitespace that directly precedes the end command is dropped.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  for (;;) {
    assert(BufferPtr < CommentEnd);

    // FIXME: It would be better to scan the text once, finding either the
    // block end command or newline.
    //
    // Extract current line.
    const char *LineEnd = findNewline(BufferPtr, CommentEnd);
    const StringRef Line(BufferPtr, LineEnd - BufferPtr);

    // Look for end command in current line.
    const size_t EndCommandPos = Line.find(VerbatimBlockEndCommandName);

    if (EndCommandPos == 0) {
      // The end command is next: emit it and leave the verbatim block.
      const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
      StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
      formTokenWithChars(T, End, tok::verbatim_block_end);
      T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
      State = LS_Normal;
      return;
    }

    const char *TextEnd;
    const char *NextLine;
    if (EndCommandPos == StringRef::npos) {
      // Current line is completely verbatim.
      TextEnd = LineEnd;
      NextLine = skipNewline(LineEnd, CommentEnd);
    } else {
      // There is some text, followed by end command. Extract text first.
      TextEnd = BufferPtr + EndCommandPos;
      NextLine = TextEnd;
      // If there is only whitespace before end command, skip whitespace and
      // lex the end command on the next iteration.
      if (isWhitespace(BufferPtr, TextEnd)) {
        BufferPtr = TextEnd;
        continue;
      }
    }

    StringRef Text(BufferPtr, TextEnd - BufferPtr);
    formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    T.setVerbatimBlockText(Text);

    State = LS_VerbatimBlockBody;
    return;
  }
}
528
529
/// Lexes one line of a verbatim block body.
///
/// Inside a C comment the line-starting decoration is skipped first. If the
/// comment ends here, an empty verbatim_block_line token is produced;
/// otherwise the line is lexed by lexVerbatimBlockFirstLine().
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}
543
544
/// Emits the verbatim_line_name token for a verbatim line command that ends
/// at \p TextBegin and arranges for the rest of the line to be lexed as its
/// text.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);

  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}
552
553
/// Emits everything from BufferPtr up to the next newline (or CommentEnd) as
/// a verbatim_line_text token and returns the lexer to the normal state.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // The text is the remainder of the current line, without the newline.
  const char *LineEnd = findNewline(BufferPtr, CommentEnd);
  StringRef LineText(BufferPtr, LineEnd - BufferPtr);
  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(LineText);

  State = LS_Normal;
}
564
565
/// Lexes an HTML character reference starting at the '&' under BufferPtr.
///
/// Named ("&amp;"), decimal ("&#65;") and hexadecimal ("&#x41;") forms are
/// recognized. A well-formed reference that resolves to non-empty text
/// becomes a text token carrying that text. In every other case the
/// characters examined so far are passed to formTextToken() unchanged.
void Lexer::lexHTMLCharacterReference(Token &T) {
  const char *Cur = BufferPtr;
  assert(*Cur == '&');
  ++Cur;

  // Fallback: hand everything consumed so far to formTextToken().
  const auto EmitAsText = [&]() { formTextToken(T, Cur); };

  if (Cur == CommentEnd) {
    EmitAsText();
    return;
  }

  enum ReferenceKind { RK_Named, RK_Decimal, RK_Hex };
  ReferenceKind Kind = RK_Hex;
  const char *NameBegin = nullptr;

  if (isHTMLNamedCharacterReferenceCharacter(*Cur)) {
    Kind = RK_Named;
    NameBegin = Cur;
    Cur = skipNamedCharacterReference(Cur, CommentEnd);
  } else if (*Cur != '#') {
    EmitAsText();
    return;
  } else {
    ++Cur; // Skip '#'.
    if (Cur == CommentEnd) {
      EmitAsText();
      return;
    }
    const char C = *Cur;
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
      Kind = RK_Decimal;
      NameBegin = Cur;
      Cur = skipDecimalCharacterReference(Cur, CommentEnd);
    } else if (C == 'x' || C == 'X') {
      ++Cur; // Skip the hex marker.
      Kind = RK_Hex;
      NameBegin = Cur;
      Cur = skipHexCharacterReference(Cur, CommentEnd);
    } else {
      EmitAsText();
      return;
    }
  }

  // The reference needs a non-empty name and a terminating semicolon.
  if (NameBegin == Cur || Cur == CommentEnd || *Cur != ';') {
    EmitAsText();
    return;
  }

  StringRef Name(NameBegin, Cur - NameBegin);
  ++Cur; // Skip semicolon.

  StringRef Resolved;
  switch (Kind) {
  case RK_Named:
    Resolved = resolveHTMLNamedCharacterReference(Name);
    break;
  case RK_Decimal:
    Resolved = resolveHTMLDecimalCharacterReference(Name);
    break;
  case RK_Hex:
    Resolved = resolveHTMLHexCharacterReference(Name);
    break;
  }

  if (Resolved.empty()) {
    EmitAsText();
    return;
  }
  formTokenWithChars(T, Cur, tok::text);
  T.setText(Resolved);
}
626
627
/// Lexes the "<name" part of an HTML start tag at BufferPtr.
///
/// If the name is not accepted by isHTMLTagName(), the characters are passed
/// to formTextToken() instead. Otherwise an html_start_tag token is formed,
/// the whitespace after the name is skipped, and the lexer enters
/// LS_HTMLStartTag when the next character can continue the tag ('>', '/' or
/// the start of an identifier).
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  // Check the bound before looking at the next character: when only
  // whitespace follows the tag name, BufferPtr equals CommentEnd and must not
  // be dereferenced.
  if (BufferPtr == CommentEnd)
    return;

  const char C = *BufferPtr;
  if (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))
    State = LS_HTMLStartTag;
}
647
648
/// Lexes one token inside an HTML start tag: an identifier, '=', a quoted
/// string, '>' or "/>".
///
/// '>' and '/' end the tag and return the lexer to the normal state. After
/// any other token, trailing whitespace is skipped and the lexer stays in the
/// tag state only if the next character can start another tag token.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *Cur = BufferPtr;
  const char First = *Cur;

  if (isHTMLIdentifierCharacter(First)) {
    Cur = skipHTMLIdentifier(Cur, CommentEnd);
    StringRef Ident(BufferPtr, Cur - BufferPtr);
    formTokenWithChars(T, Cur, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else if (First == '=') {
    ++Cur;
    formTokenWithChars(T, Cur, tok::html_equals);
  } else if (First == '\"' || First == '\'') {
    const char *OpenQuote = Cur;
    Cur = skipHTMLQuotedString(Cur, CommentEnd);
    const char *ClosingQuote = Cur;
    if (Cur != CommentEnd) // Skip closing quote.
      ++Cur;
    formTokenWithChars(T, Cur, tok::html_quoted_string);
    // The token text excludes both quote characters.
    T.setHTMLQuotedString(
        StringRef(OpenQuote + 1, ClosingQuote - (OpenQuote + 1)));
  } else if (First == '>') {
    ++Cur;
    formTokenWithChars(T, Cur, tok::html_greater);
    State = LS_Normal;
    return;
  } else if (First == '/') {
    ++Cur;
    if (Cur != CommentEnd && *Cur == '>') {
      ++Cur;
      formTokenWithChars(T, Cur, tok::html_slash_greater);
    } else {
      formTextToken(T, Cur);
    }
    State = LS_Normal;
    return;
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  const char Next = *BufferPtr;
  const bool MoreTagTokens = isHTMLIdentifierStartingCharacter(Next) ||
                             Next == '=' || Next == '\"' || Next == '\'' ||
                             Next == '>' || Next == '/';
  if (!MoreTagTokens)
    State = LS_Normal;
}
709
710
/// Lexes the "</name" part of an HTML end tag at BufferPtr.
///
/// Whitespace is allowed between "</" and the name, and after the name. If
/// the name is not accepted by isHTMLTagName(), the characters are passed to
/// formTextToken() instead. The lexer enters LS_HTMLEndTag only when a '>'
/// follows.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  StringRef Name(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(Name)) {
    formTextToken(T, NameEnd);
    return;
  }

  // The token also covers the whitespace that follows the name.
  const char *TokenEnd = skipWhitespace(NameEnd, CommentEnd);
  formTokenWithChars(T, TokenEnd, tok::html_end_tag);
  T.setHTMLTagEndName(Name);

  const bool ClosedProperly = BufferPtr != CommentEnd && *BufferPtr == '>';
  if (ClosedProperly)
    State = LS_HTMLEndTag;
}
729
730
/// Lexes the '>' that closes an HTML end tag and returns the lexer to the
/// normal state. BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  const char *AfterGreater = BufferPtr + 1;
  formTokenWithChars(T, AfterGreater, tok::html_greater);
  State = LS_Normal;
}
736
737
/// Creates a lexer over the text in [BufferStart, BufferEnd).
///
/// The lexer starts at \p BufferStart, before any comment
/// (LCS_BeforeComment), in the normal lexing state. \p ParseCommands selects
/// whether commands and HTML are recognized or everything is lexed as text
/// and newlines.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd,
             bool ParseCommands)
    : Allocator(Allocator),
      Diags(Diags),
      Traits(Traits),
      BufferStart(BufferStart),
      BufferEnd(BufferEnd),
      BufferPtr(BufferStart),
      FileLoc(FileLoc),
      ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment),
      State(LS_Normal) {}
744
745
/// Lexes the next token into \p T.
///
/// Drives the comment-level state machine: it enters a BCPL or C comment
/// (skipping the Doxygen marker characters), forwards to lexCommentText()
/// while inside a comment, synthesizes newline tokens after C comments and
/// for the gap between comments, and produces tok::eof at the end of the
/// buffer.
void Lexer::lex(Token &T) {
  // Each iteration handles one comment state. Transitions that produce no
  // token `continue`; everything else returns after forming a token.
  for (;;) {
    switch (CommentState) {
    case LCS_BeforeComment: {
      if (BufferPtr == BufferEnd) {
        formTokenWithChars(T, BufferPtr, tok::eof);
        return;
      }

      assert(*BufferPtr == '/');
      ++BufferPtr; // Skip first slash.
      const char Second = *BufferPtr;

      if (Second == '/') { // BCPL comment.
        ++BufferPtr;       // Skip second slash.

        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        if (BufferPtr != BufferEnd &&
            (*BufferPtr == '/' || *BufferPtr == '!'))
          ++BufferPtr;

        // Skip less-than symbol that marks trailing comments.
        // Skip it even if the comment is not a Doxygen one, because //< and
        // /*< are frequent typos.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideBCPLComment;
        // A verbatim block state is left as is when a new BCPL comment
        // begins.
        if (State != LS_VerbatimBlockBody &&
            State != LS_VerbatimBlockFirstLine)
          State = LS_Normal;
        CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
        continue;
      }

      if (Second == '*') { // C comment.
        ++BufferPtr;       // Skip star.

        // Skip Doxygen magic marker. A '*' directly followed by '/' is the
        // end of the comment, not a marker.
        const char C = *BufferPtr;
        if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
          ++BufferPtr;

        // Skip less-than symbol that marks trailing comments.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideCComment;
        State = LS_Normal;
        CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
        continue;
      }

      llvm_unreachable("second character of comment should be '/' or '*'");
    }

    case LCS_BetweenComments: {
      // Consecutive comments are extracted only if there is only whitespace
      // between them. So we can search for the start of the next comment.
      const char *EndWhitespace = BufferPtr;
      while (EndWhitespace != BufferEnd && *EndWhitespace != '/')
        ++EndWhitespace;

      // Turn any whitespace between comments (and there is only whitespace
      // between them -- guaranteed by comment extraction) into a newline. We
      // have two newlines between C comments in total (first one was
      // synthesized after a comment).
      formTokenWithChars(T, EndWhitespace, tok::newline);

      CommentState = LCS_BeforeComment;
      return;
    }

    case LCS_InsideBCPLComment:
    case LCS_InsideCComment:
      if (BufferPtr != CommentEnd) {
        lexCommentText(T);
        return;
      }

      if (CommentState != LCS_InsideCComment) {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        continue;
      }

      // Skip C comment closing sequence.
      assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
      BufferPtr += 2;
      assert(BufferPtr <= BufferEnd);

      // Synthesize a newline just after the C comment, regardless of whether
      // there actually is a newline.
      formTokenWithChars(T, BufferPtr, tok::newline);

      CommentState = LCS_BetweenComments;
      return;
    }
    return;
  }
}
846
847
/// Returns the source text of \p Tok: Tok.getLength() characters of the file
/// buffer, starting at the token's location.
///
/// \returns an empty StringRef if the buffer data is reported as invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  const std::pair<FileID, unsigned> Decomposed =
      SourceMgr.getDecomposedLoc(Tok.getLocation());

  bool Invalid = false;
  const StringRef Buffer =
      SourceMgr.getBufferData(Decomposed.first, &Invalid);
  if (Invalid)
    return StringRef();

  return StringRef(Buffer.data() + Decomposed.second, Tok.getLength());
}
860
861
} // end namespace comments
862
} // end namespace clang
863
864