Path: blob/main/contrib/llvm-project/clang/lib/AST/CommentLexer.cpp
35260 views
//===--- CommentLexer.cpp -------------------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//78#include "clang/AST/CommentLexer.h"9#include "clang/AST/CommentCommandTraits.h"10#include "clang/AST/CommentDiagnostic.h"11#include "clang/Basic/CharInfo.h"12#include "llvm/ADT/StringExtras.h"13#include "llvm/ADT/StringSwitch.h"14#include "llvm/Support/ConvertUTF.h"15#include "llvm/Support/ErrorHandling.h"1617namespace clang {18namespace comments {1920void Token::dump(const Lexer &L, const SourceManager &SM) const {21llvm::errs() << "comments::Token Kind=" << Kind << " ";22Loc.print(llvm::errs(), SM);23llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";24}2526static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {27return isLetter(C);28}2930static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {31return isDigit(C);32}3334static inline bool isHTMLHexCharacterReferenceCharacter(char C) {35return isHexDigit(C);36}3738static inline StringRef convertCodePointToUTF8(39llvm::BumpPtrAllocator &Allocator,40unsigned CodePoint) {41char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);42char *ResolvedPtr = Resolved;43if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))44return StringRef(Resolved, ResolvedPtr - Resolved);45else46return StringRef();47}4849namespace {5051#include "clang/AST/CommentHTMLTags.inc"52#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"5354} // end anonymous namespace5556StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {57// Fast path, first check a few most widely used named character references.58return llvm::StringSwitch<StringRef>(Name)59.Case("amp", "&")60.Case("lt", "<")61.Case("gt", ">")62.Case("quot", "\"")63.Case("apos", "\'")64// Slow path.65.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));66}6768StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {69unsigned CodePoint = 0;70for (unsigned i = 0, e = Name.size(); i != e; ++i) {71assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));72CodePoint *= 10;73CodePoint += Name[i] - '0';74}75return convertCodePointToUTF8(Allocator, CodePoint);76}7778StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {79unsigned CodePoint = 0;80for (unsigned i = 0, e = Name.size(); i != e; ++i) {81CodePoint *= 16;82const char C = Name[i];83assert(isHTMLHexCharacterReferenceCharacter(C));84CodePoint += llvm::hexDigitValue(C);85}86return convertCodePointToUTF8(Allocator, CodePoint);87}8889void Lexer::skipLineStartingDecorations() {90// This function should be called only for C comments91assert(CommentState == LCS_InsideCComment);9293if (BufferPtr == CommentEnd)94return;9596const char *NewBufferPtr = BufferPtr;97while (isHorizontalWhitespace(*NewBufferPtr))98if (++NewBufferPtr == CommentEnd)99return;100if (*NewBufferPtr == '*')101BufferPtr = NewBufferPtr + 1;102}103104namespace {105/// Returns pointer to the first newline character in the string.106const char *findNewline(const char *BufferPtr, const char *BufferEnd) {107for ( ; BufferPtr != BufferEnd; ++BufferPtr) {108if (isVerticalWhitespace(*BufferPtr))109return BufferPtr;110}111return BufferEnd;112}113114const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {115if (BufferPtr == BufferEnd)116return BufferPtr;117118if (*BufferPtr == '\n')119BufferPtr++;120else {121assert(*BufferPtr == '\r');122BufferPtr++;123if (BufferPtr != BufferEnd && *BufferPtr == '\n')124BufferPtr++;125}126return BufferPtr;127}128129const char *skipNamedCharacterReference(const char *BufferPtr,130const char *BufferEnd) {131for ( ; BufferPtr != BufferEnd; ++BufferPtr) {132if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))133return BufferPtr;134}135return BufferEnd;136}137138const char *skipDecimalCharacterReference(const char *BufferPtr,139const char *BufferEnd) {140for ( ; BufferPtr != BufferEnd; ++BufferPtr) {141if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))142return BufferPtr;143}144return BufferEnd;145}146147const char *skipHexCharacterReference(const char *BufferPtr,148const char *BufferEnd) {149for ( ; BufferPtr != BufferEnd; ++BufferPtr) {150if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))151return BufferPtr;152}153return BufferEnd;154}155156bool isHTMLIdentifierStartingCharacter(char C) {157return isLetter(C);158}159160bool isHTMLIdentifierCharacter(char C) {161return isAlphanumeric(C);162}163164const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {165for ( ; BufferPtr != BufferEnd; ++BufferPtr) {166if (!isHTMLIdentifierCharacter(*BufferPtr))167return BufferPtr;168}169return BufferEnd;170}171172/// Skip HTML string quoted in single or double quotes. Escaping quotes inside173/// string allowed.174///175/// Returns pointer to closing quote.176const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)177{178const char Quote = *BufferPtr;179assert(Quote == '\"' || Quote == '\'');180181BufferPtr++;182for ( ; BufferPtr != BufferEnd; ++BufferPtr) {183const char C = *BufferPtr;184if (C == Quote && BufferPtr[-1] != '\\')185return BufferPtr;186}187return BufferEnd;188}189190const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {191for ( ; BufferPtr != BufferEnd; ++BufferPtr) {192if (!isWhitespace(*BufferPtr))193return BufferPtr;194}195return BufferEnd;196}197198bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {199return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;200}201202bool isCommandNameStartCharacter(char C) {203return isLetter(C);204}205206bool isCommandNameCharacter(char C) {207return isAlphanumeric(C);208}209210const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {211for ( ; BufferPtr != BufferEnd; ++BufferPtr) {212if (!isCommandNameCharacter(*BufferPtr))213return BufferPtr;214}215return BufferEnd;216}217218/// Return the one past end pointer for BCPL comments.219/// Handles newlines escaped with backslash or trigraph for backslahs.220const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {221const char *CurPtr = BufferPtr;222while (CurPtr != BufferEnd) {223while (!isVerticalWhitespace(*CurPtr)) {224CurPtr++;225if (CurPtr == BufferEnd)226return BufferEnd;227}228// We found a newline, check if it is escaped.229const char *EscapePtr = CurPtr - 1;230while(isHorizontalWhitespace(*EscapePtr))231EscapePtr--;232233if (*EscapePtr == '\\' ||234(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&235EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {236// We found an escaped newline.237CurPtr = skipNewline(CurPtr, BufferEnd);238} else239return CurPtr; // Not an escaped newline.240}241return BufferEnd;242}243244/// Return the one past end pointer for C comments.245/// Very dumb, does not handle escaped newlines or trigraphs.246const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {247for ( ; BufferPtr != BufferEnd; ++BufferPtr) {248if (*BufferPtr == '*') {249assert(BufferPtr + 1 != BufferEnd);250if (*(BufferPtr + 1) == '/')251return BufferPtr;252}253}254llvm_unreachable("buffer end hit before '*/' was seen");255}256257} // end anonymous namespace258259void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,260tok::TokenKind Kind) {261const unsigned TokLen = TokEnd - BufferPtr;262Result.setLocation(getSourceLocation(BufferPtr));263Result.setKind(Kind);264Result.setLength(TokLen);265#ifndef NDEBUG266Result.TextPtr = "<UNSET>";267Result.IntVal = 7;268#endif269BufferPtr = TokEnd;270}271272const char *Lexer::skipTextToken() {273const char *TokenPtr = BufferPtr;274assert(TokenPtr < CommentEnd);275StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";276277again:278size_t End =279StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);280if (End == StringRef::npos)281return CommentEnd;282283// Doxygen doesn't recognize any commands in a one-line double quotation.284// If we don't find an ending quotation mark, we pretend it never began.285if (*(TokenPtr + End) == '\"') {286TokenPtr += End + 1;287End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");288if (End != StringRef::npos && *(TokenPtr + End) == '\"')289TokenPtr += End + 1;290goto again;291}292return TokenPtr + End;293}294295void Lexer::lexCommentText(Token &T) {296assert(CommentState == LCS_InsideBCPLComment ||297CommentState == LCS_InsideCComment);298299// Handles lexing non-command text, i.e. text and newline.300auto HandleNonCommandToken = [&]() -> void {301assert(State == LS_Normal);302303const char *TokenPtr = BufferPtr;304assert(TokenPtr < CommentEnd);305switch (*TokenPtr) {306case '\n':307case '\r':308TokenPtr = skipNewline(TokenPtr, CommentEnd);309formTokenWithChars(T, TokenPtr, tok::newline);310311if (CommentState == LCS_InsideCComment)312skipLineStartingDecorations();313return;314315default:316return formTextToken(T, skipTextToken());317}318};319320if (!ParseCommands)321return HandleNonCommandToken();322323switch (State) {324case LS_Normal:325break;326case LS_VerbatimBlockFirstLine:327lexVerbatimBlockFirstLine(T);328return;329case LS_VerbatimBlockBody:330lexVerbatimBlockBody(T);331return;332case LS_VerbatimLineText:333lexVerbatimLineText(T);334return;335case LS_HTMLStartTag:336lexHTMLStartTag(T);337return;338case LS_HTMLEndTag:339lexHTMLEndTag(T);340return;341}342343assert(State == LS_Normal);344const char *TokenPtr = BufferPtr;345assert(TokenPtr < CommentEnd);346switch(*TokenPtr) {347case '\\':348case '@': {349// Commands that start with a backslash and commands that start with350// 'at' have equivalent semantics. But we keep information about the351// exact syntax in AST for comments.352tok::TokenKind CommandKind =353(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;354TokenPtr++;355if (TokenPtr == CommentEnd) {356formTextToken(T, TokenPtr);357return;358}359char C = *TokenPtr;360switch (C) {361default:362break;363364case '\\': case '@': case '&': case '$':365case '#': case '<': case '>': case '%':366case '\"': case '.': case ':':367// This is one of \\ \@ \& \$ etc escape sequences.368TokenPtr++;369if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {370// This is the \:: escape sequence.371TokenPtr++;372}373StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));374formTokenWithChars(T, TokenPtr, tok::text);375T.setText(UnescapedText);376return;377}378379// Don't make zero-length commands.380if (!isCommandNameStartCharacter(*TokenPtr)) {381formTextToken(T, TokenPtr);382return;383}384385TokenPtr = skipCommandName(TokenPtr, CommentEnd);386unsigned Length = TokenPtr - (BufferPtr + 1);387388// Hardcoded support for lexing LaTeX formula commands389// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.390if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {391C = *TokenPtr;392if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||393C == '{' || C == '}') {394TokenPtr++;395Length++;396}397}398399StringRef CommandName(BufferPtr + 1, Length);400401const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);402if (!Info) {403if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {404StringRef CorrectedName = Info->Name;405SourceLocation Loc = getSourceLocation(BufferPtr);406SourceLocation EndLoc = getSourceLocation(TokenPtr);407SourceRange FullRange = SourceRange(Loc, EndLoc);408SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);409Diag(Loc, diag::warn_correct_comment_command_name)410<< FullRange << CommandName << CorrectedName411<< FixItHint::CreateReplacement(CommandRange, CorrectedName);412} else {413formTokenWithChars(T, TokenPtr, tok::unknown_command);414T.setUnknownCommandName(CommandName);415Diag(T.getLocation(), diag::warn_unknown_comment_command_name)416<< SourceRange(T.getLocation(), T.getEndLocation());417return;418}419}420if (Info->IsVerbatimBlockCommand) {421setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);422return;423}424if (Info->IsVerbatimLineCommand) {425setupAndLexVerbatimLine(T, TokenPtr, Info);426return;427}428formTokenWithChars(T, TokenPtr, CommandKind);429T.setCommandID(Info->getID());430return;431}432433case '&':434lexHTMLCharacterReference(T);435return;436437case '<': {438TokenPtr++;439if (TokenPtr == CommentEnd) {440formTextToken(T, TokenPtr);441return;442}443const char C = *TokenPtr;444if (isHTMLIdentifierStartingCharacter(C))445setupAndLexHTMLStartTag(T);446else if (C == '/')447setupAndLexHTMLEndTag(T);448else449formTextToken(T, TokenPtr);450return;451}452453default:454return HandleNonCommandToken();455}456}457458void Lexer::setupAndLexVerbatimBlock(Token &T,459const char *TextBegin,460char Marker, const CommandInfo *Info) {461assert(Info->IsVerbatimBlockCommand);462463VerbatimBlockEndCommandName.clear();464VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");465VerbatimBlockEndCommandName.append(Info->EndCommandName);466467formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);468T.setVerbatimBlockID(Info->getID());469470// If there is a newline following the verbatim opening command, skip the471// newline so that we don't create an tok::verbatim_block_line with empty472// text content.473if (BufferPtr != CommentEnd &&474isVerticalWhitespace(*BufferPtr)) {475BufferPtr = skipNewline(BufferPtr, CommentEnd);476State = LS_VerbatimBlockBody;477return;478}479480State = LS_VerbatimBlockFirstLine;481}482483void Lexer::lexVerbatimBlockFirstLine(Token &T) {484again:485assert(BufferPtr < CommentEnd);486487// FIXME: It would be better to scan the text once, finding either the block488// end command or newline.489//490// Extract current line.491const char *Newline = findNewline(BufferPtr, CommentEnd);492StringRef Line(BufferPtr, Newline - BufferPtr);493494// Look for end command in current line.495size_t Pos = Line.find(VerbatimBlockEndCommandName);496const char *TextEnd;497const char *NextLine;498if (Pos == StringRef::npos) {499// Current line is completely verbatim.500TextEnd = Newline;501NextLine = skipNewline(Newline, CommentEnd);502} else if (Pos == 0) {503// Current line contains just an end command.504const char *End = BufferPtr + VerbatimBlockEndCommandName.size();505StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));506formTokenWithChars(T, End, tok::verbatim_block_end);507T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());508State = LS_Normal;509return;510} else {511// There is some text, followed by end command. Extract text first.512TextEnd = BufferPtr + Pos;513NextLine = TextEnd;514// If there is only whitespace before end command, skip whitespace.515if (isWhitespace(BufferPtr, TextEnd)) {516BufferPtr = TextEnd;517goto again;518}519}520521StringRef Text(BufferPtr, TextEnd - BufferPtr);522formTokenWithChars(T, NextLine, tok::verbatim_block_line);523T.setVerbatimBlockText(Text);524525State = LS_VerbatimBlockBody;526}527528void Lexer::lexVerbatimBlockBody(Token &T) {529assert(State == LS_VerbatimBlockBody);530531if (CommentState == LCS_InsideCComment)532skipLineStartingDecorations();533534if (BufferPtr == CommentEnd) {535formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);536T.setVerbatimBlockText("");537return;538}539540lexVerbatimBlockFirstLine(T);541}542543void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,544const CommandInfo *Info) {545assert(Info->IsVerbatimLineCommand);546formTokenWithChars(T, TextBegin, tok::verbatim_line_name);547T.setVerbatimLineID(Info->getID());548549State = LS_VerbatimLineText;550}551552void Lexer::lexVerbatimLineText(Token &T) {553assert(State == LS_VerbatimLineText);554555// Extract current line.556const char *Newline = findNewline(BufferPtr, CommentEnd);557StringRef Text(BufferPtr, Newline - BufferPtr);558formTokenWithChars(T, Newline, tok::verbatim_line_text);559T.setVerbatimLineText(Text);560561State = LS_Normal;562}563564void Lexer::lexHTMLCharacterReference(Token &T) {565const char *TokenPtr = BufferPtr;566assert(*TokenPtr == '&');567TokenPtr++;568if (TokenPtr == CommentEnd) {569formTextToken(T, TokenPtr);570return;571}572const char *NamePtr;573bool isNamed = false;574bool isDecimal = false;575char C = *TokenPtr;576if (isHTMLNamedCharacterReferenceCharacter(C)) {577NamePtr = TokenPtr;578TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);579isNamed = true;580} else if (C == '#') {581TokenPtr++;582if (TokenPtr == CommentEnd) {583formTextToken(T, TokenPtr);584return;585}586C = *TokenPtr;587if (isHTMLDecimalCharacterReferenceCharacter(C)) {588NamePtr = TokenPtr;589TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);590isDecimal = true;591} else if (C == 'x' || C == 'X') {592TokenPtr++;593NamePtr = TokenPtr;594TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);595} else {596formTextToken(T, TokenPtr);597return;598}599} else {600formTextToken(T, TokenPtr);601return;602}603if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||604*TokenPtr != ';') {605formTextToken(T, TokenPtr);606return;607}608StringRef Name(NamePtr, TokenPtr - NamePtr);609TokenPtr++; // Skip semicolon.610StringRef Resolved;611if (isNamed)612Resolved = resolveHTMLNamedCharacterReference(Name);613else if (isDecimal)614Resolved = resolveHTMLDecimalCharacterReference(Name);615else616Resolved = resolveHTMLHexCharacterReference(Name);617618if (Resolved.empty()) {619formTextToken(T, TokenPtr);620return;621}622formTokenWithChars(T, TokenPtr, tok::text);623T.setText(Resolved);624}625626void Lexer::setupAndLexHTMLStartTag(Token &T) {627assert(BufferPtr[0] == '<' &&628isHTMLIdentifierStartingCharacter(BufferPtr[1]));629const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);630StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));631if (!isHTMLTagName(Name)) {632formTextToken(T, TagNameEnd);633return;634}635636formTokenWithChars(T, TagNameEnd, tok::html_start_tag);637T.setHTMLTagStartName(Name);638639BufferPtr = skipWhitespace(BufferPtr, CommentEnd);640641const char C = *BufferPtr;642if (BufferPtr != CommentEnd &&643(C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))644State = LS_HTMLStartTag;645}646647void Lexer::lexHTMLStartTag(Token &T) {648assert(State == LS_HTMLStartTag);649650const char *TokenPtr = BufferPtr;651char C = *TokenPtr;652if (isHTMLIdentifierCharacter(C)) {653TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);654StringRef Ident(BufferPtr, TokenPtr - BufferPtr);655formTokenWithChars(T, TokenPtr, tok::html_ident);656T.setHTMLIdent(Ident);657} else {658switch (C) {659case '=':660TokenPtr++;661formTokenWithChars(T, TokenPtr, tok::html_equals);662break;663case '\"':664case '\'': {665const char *OpenQuote = TokenPtr;666TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);667const char *ClosingQuote = TokenPtr;668if (TokenPtr != CommentEnd) // Skip closing quote.669TokenPtr++;670formTokenWithChars(T, TokenPtr, tok::html_quoted_string);671T.setHTMLQuotedString(StringRef(OpenQuote + 1,672ClosingQuote - (OpenQuote + 1)));673break;674}675case '>':676TokenPtr++;677formTokenWithChars(T, TokenPtr, tok::html_greater);678State = LS_Normal;679return;680case '/':681TokenPtr++;682if (TokenPtr != CommentEnd && *TokenPtr == '>') {683TokenPtr++;684formTokenWithChars(T, TokenPtr, tok::html_slash_greater);685} else686formTextToken(T, TokenPtr);687688State = LS_Normal;689return;690}691}692693// Now look ahead and return to normal state if we don't see any HTML tokens694// ahead.695BufferPtr = skipWhitespace(BufferPtr, CommentEnd);696if (BufferPtr == CommentEnd) {697State = LS_Normal;698return;699}700701C = *BufferPtr;702if (!isHTMLIdentifierStartingCharacter(C) &&703C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {704State = LS_Normal;705return;706}707}708709void Lexer::setupAndLexHTMLEndTag(Token &T) {710assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');711712const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);713const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);714StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);715if (!isHTMLTagName(Name)) {716formTextToken(T, TagNameEnd);717return;718}719720const char *End = skipWhitespace(TagNameEnd, CommentEnd);721722formTokenWithChars(T, End, tok::html_end_tag);723T.setHTMLTagEndName(Name);724725if (BufferPtr != CommentEnd && *BufferPtr == '>')726State = LS_HTMLEndTag;727}728729void Lexer::lexHTMLEndTag(Token &T) {730assert(BufferPtr != CommentEnd && *BufferPtr == '>');731732formTokenWithChars(T, BufferPtr + 1, tok::html_greater);733State = LS_Normal;734}735736Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,737const CommandTraits &Traits, SourceLocation FileLoc,738const char *BufferStart, const char *BufferEnd, bool ParseCommands)739: Allocator(Allocator), Diags(Diags), Traits(Traits),740BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),741FileLoc(FileLoc), ParseCommands(ParseCommands),742CommentState(LCS_BeforeComment), State(LS_Normal) {}743744void Lexer::lex(Token &T) {745again:746switch (CommentState) {747case LCS_BeforeComment:748if (BufferPtr == BufferEnd) {749formTokenWithChars(T, BufferPtr, tok::eof);750return;751}752753assert(*BufferPtr == '/');754BufferPtr++; // Skip first slash.755switch(*BufferPtr) {756case '/': { // BCPL comment.757BufferPtr++; // Skip second slash.758759if (BufferPtr != BufferEnd) {760// Skip Doxygen magic marker, if it is present.761// It might be missing because of a typo //< or /*<, or because we762// merged this non-Doxygen comment into a bunch of Doxygen comments763// around it: /** ... */ /* ... */ /** ... */764const char C = *BufferPtr;765if (C == '/' || C == '!')766BufferPtr++;767}768769// Skip less-than symbol that marks trailing comments.770// Skip it even if the comment is not a Doxygen one, because //< and /*<771// are frequent typos.772if (BufferPtr != BufferEnd && *BufferPtr == '<')773BufferPtr++;774775CommentState = LCS_InsideBCPLComment;776if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)777State = LS_Normal;778CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);779goto again;780}781case '*': { // C comment.782BufferPtr++; // Skip star.783784// Skip Doxygen magic marker.785const char C = *BufferPtr;786if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')787BufferPtr++;788789// Skip less-than symbol that marks trailing comments.790if (BufferPtr != BufferEnd && *BufferPtr == '<')791BufferPtr++;792793CommentState = LCS_InsideCComment;794State = LS_Normal;795CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);796goto again;797}798default:799llvm_unreachable("second character of comment should be '/' or '*'");800}801802case LCS_BetweenComments: {803// Consecutive comments are extracted only if there is only whitespace804// between them. So we can search for the start of the next comment.805const char *EndWhitespace = BufferPtr;806while(EndWhitespace != BufferEnd && *EndWhitespace != '/')807EndWhitespace++;808809// Turn any whitespace between comments (and there is only whitespace810// between them -- guaranteed by comment extraction) into a newline. We811// have two newlines between C comments in total (first one was synthesized812// after a comment).813formTokenWithChars(T, EndWhitespace, tok::newline);814815CommentState = LCS_BeforeComment;816break;817}818819case LCS_InsideBCPLComment:820case LCS_InsideCComment:821if (BufferPtr != CommentEnd) {822lexCommentText(T);823break;824} else {825// Skip C comment closing sequence.826if (CommentState == LCS_InsideCComment) {827assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');828BufferPtr += 2;829assert(BufferPtr <= BufferEnd);830831// Synthenize newline just after the C comment, regardless if there is832// actually a newline.833formTokenWithChars(T, BufferPtr, tok::newline);834835CommentState = LCS_BetweenComments;836break;837} else {838// Don't synthesized a newline after BCPL comment.839CommentState = LCS_BetweenComments;840goto again;841}842}843}844}845846StringRef Lexer::getSpelling(const Token &Tok,847const SourceManager &SourceMgr) const {848SourceLocation Loc = Tok.getLocation();849std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);850851bool InvalidTemp = false;852StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);853if (InvalidTemp)854return StringRef();855856const char *Begin = File.data() + LocInfo.second;857return StringRef(Begin, Tok.getLength());858}859860} // end namespace comments861} // end namespace clang862863864