Path: blob/main/contrib/llvm-project/clang/lib/Tooling/Syntax/Tokens.cpp
//===- Tokens.cpp - collect tokens from preprocessing ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Tokens.h"

#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using namespace clang;
using namespace clang::syntax;

namespace {
// Finds the smallest consecutive subsequence of Toks that covers R.
llvm::ArrayRef<syntax::Token>
getTokensCovering(llvm::ArrayRef<syntax::Token> Toks, SourceRange R,
                  const SourceManager &SM) {
  if (R.isInvalid())
    return {};
  const syntax::Token *Begin =
      llvm::partition_point(Toks, [&](const syntax::Token &T) {
        return SM.isBeforeInTranslationUnit(T.location(), R.getBegin());
      });
  const syntax::Token *End =
      llvm::partition_point(Toks, [&](const syntax::Token &T) {
        return !SM.isBeforeInTranslationUnit(R.getEnd(), T.location());
      });
  if (Begin > End)
    return {};
  return {Begin, End};
}
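
// Illustration (hypothetical input, not from the original file): given the
// spelled tokens of
//   int a = 1;
// and R covering `a = 1`, getTokensCovering returns the three-token slice
// `a`, `=`, `1`. Both endpoints are found with llvm::partition_point, i.e.
// two binary searches over Toks.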

// Finds the range within FID corresponding to expanded tokens [First, Last].
// Prev precedes First and Next follows Last, these must *not* be included.
// If no range satisfies the criteria, returns an invalid range.
//
// #define ID(x) x
// ID(ID(ID(a1) a2))
//          ~~          -> a1
//              ~~      -> a2
//       ~~~~~~~~~      -> a1 a2
SourceRange spelledForExpandedSlow(SourceLocation First, SourceLocation Last,
                                   SourceLocation Prev, SourceLocation Next,
                                   FileID TargetFile,
                                   const SourceManager &SM) {
  // There are two main parts to this algorithm:
  //  - identifying which spelled range covers the expanded tokens
  //  - validating that this range doesn't cover any extra tokens (First/Last)
  //
  // We do these in order. However as we transform the expanded range into the
  // spelled one, we adjust First/Last so the validation remains simple.

  assert(SM.getSLocEntry(TargetFile).isFile());
  // In most cases, to select First and Last we must return their expansion
  // range, i.e. the whole of any macros they are included in.
  //
  // When First and Last are part of the *same macro arg* of a macro written
  // in TargetFile, we return that slice of the arg, i.e. their spelling range.
  //
  // Unwrap such macro calls. If the target file has A(B(C)), the
  // SourceLocation stack of a token inside C shows us the expansion of A
  // first, then B, then any macros inside C's body, then C itself.
  // (This is the reverse of the order the PP applies the expansions in.)
  while (First.isMacroID() && Last.isMacroID()) {
    auto DecFirst = SM.getDecomposedLoc(First);
    auto DecLast = SM.getDecomposedLoc(Last);
    auto &ExpFirst = SM.getSLocEntry(DecFirst.first).getExpansion();
    auto &ExpLast = SM.getSLocEntry(DecLast.first).getExpansion();

    if (!ExpFirst.isMacroArgExpansion() || !ExpLast.isMacroArgExpansion())
      break;
    // Locations are in the same macro arg if they expand to the same place.
    // (They may still have different FileIDs - an arg can have >1 chunks!)
    if (ExpFirst.getExpansionLocStart() != ExpLast.getExpansionLocStart())
      break;
    // Careful, given:
    //   #define HIDE ID(ID(a))
    //   ID(ID(HIDE))
    // The token `a` is wrapped in 4 arg-expansions, we only want to unwrap 2.
    // We distinguish them by whether the macro expands into the target file.
    // Fortunately, the target file ones will always appear first.
    auto ExpFileID = SM.getFileID(ExpFirst.getExpansionLocStart());
    if (ExpFileID == TargetFile)
      break;
    // Replace each endpoint with its spelling inside the macro arg.
    // (This is getImmediateSpellingLoc without repeating lookups.)
    First = ExpFirst.getSpellingLoc().getLocWithOffset(DecFirst.second);
    Last = ExpLast.getSpellingLoc().getLocWithOffset(DecLast.second);
  }

  // In all remaining cases we need the full containing macros.
  // If this overlaps Prev or Next, then no range is possible.
  SourceRange Candidate =
      SM.getExpansionRange(SourceRange(First, Last)).getAsRange();
  auto DecFirst = SM.getDecomposedExpansionLoc(Candidate.getBegin());
  auto DecLast = SM.getDecomposedExpansionLoc(Candidate.getEnd());
  // Can end up in the wrong file due to bad input or token-pasting
  // shenanigans.
  if (Candidate.isInvalid() || DecFirst.first != TargetFile ||
      DecLast.first != TargetFile)
    return SourceRange();
  // Check bounds, which may still be inside macros.
  if (Prev.isValid()) {
    auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Prev).getBegin());
    if (Dec.first != DecFirst.first || Dec.second >= DecFirst.second)
      return SourceRange();
  }
  if (Next.isValid()) {
    auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Next).getEnd());
    if (Dec.first != DecLast.first || Dec.second <= DecLast.second)
      return SourceRange();
  }
  // Now we know that Candidate is a file range that covers [First, Last]
  // without encroaching on {Prev, Next}. Ship it!
  return Candidate;
}
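
// Illustration (hypothetical input, not from the original file): given
//   #define PLUS1(x) x + 1
//   PLUS1(a)
// mapping the expanded tokens `a + 1` yields the whole spelled call
// `PLUS1(a)`, while mapping only `a +` fails: the expansion-range candidate
// would also cover the expanded `1`, so the Next bounds check rejects it and
// an invalid range is returned.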

} // namespace

syntax::Token::Token(SourceLocation Location, unsigned Length,
                     tok::TokenKind Kind)
    : Location(Location), Length(Length), Kind(Kind) {
  assert(Location.isValid());
}

syntax::Token::Token(const clang::Token &T)
    : Token(T.getLocation(), T.getLength(), T.getKind()) {
  assert(!T.isAnnotation());
}

llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
  bool Invalid = false;
  const char *Start = SM.getCharacterData(location(), &Invalid);
  assert(!Invalid);
  return llvm::StringRef(Start, length());
}

FileRange syntax::Token::range(const SourceManager &SM) const {
  assert(location().isFileID() && "must be a spelled token");
  FileID File;
  unsigned StartOffset;
  std::tie(File, StartOffset) = SM.getDecomposedLoc(location());
  return FileRange(File, StartOffset, StartOffset + length());
}

FileRange syntax::Token::range(const SourceManager &SM,
                               const syntax::Token &First,
                               const syntax::Token &Last) {
  auto F = First.range(SM);
  auto L = Last.range(SM);
  assert(F.file() == L.file() && "tokens from different files");
  assert((F == L || F.endOffset() <= L.beginOffset()) &&
         "wrong order of tokens");
  return FileRange(F.file(), F.beginOffset(), L.endOffset());
}

llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) {
  return OS << T.str();
}

FileRange::FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset)
    : File(File), Begin(BeginOffset), End(EndOffset) {
  assert(File.isValid());
  assert(BeginOffset <= EndOffset);
}

FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
                     unsigned Length) {
  assert(BeginLoc.isValid());
  assert(BeginLoc.isFileID());

  std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
  End = Begin + Length;
}

FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
                     SourceLocation EndLoc) {
  assert(BeginLoc.isValid());
  assert(BeginLoc.isFileID());
  assert(EndLoc.isValid());
  assert(EndLoc.isFileID());
  assert(SM.getFileID(BeginLoc) == SM.getFileID(EndLoc));
  assert(SM.getFileOffset(BeginLoc) <= SM.getFileOffset(EndLoc));

  std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
  End = SM.getFileOffset(EndLoc);
}

llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS,
                                      const FileRange &R) {
  return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})",
                             R.file().getHashValue(), R.beginOffset(),
                             R.endOffset());
}

llvm::StringRef FileRange::text(const SourceManager &SM) const {
  bool Invalid = false;
  StringRef Text = SM.getBufferData(File, &Invalid);
  if (Invalid)
    return "";
  assert(Begin <= Text.size());
  assert(End <= Text.size());
  return Text.substr(Begin, length());
}

void TokenBuffer::indexExpandedTokens() {
  // No-op if the index is already created.
  if (!ExpandedTokIndex.empty())
    return;
  ExpandedTokIndex.reserve(ExpandedTokens.size());
  // Index ExpandedTokens for faster lookups by SourceLocation.
  for (size_t I = 0, E = ExpandedTokens.size(); I != E; ++I) {
    SourceLocation Loc = ExpandedTokens[I].location();
    if (Loc.isValid())
      ExpandedTokIndex[Loc] = I;
  }
}
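
// Illustrative usage (a sketch; `TB` and `Decl` are hypothetical, not from
// the original file):
//   syntax::TokenBuffer &TB = ...;
//   TB.indexExpandedTokens(); // build the location index once
//   auto Toks = TB.expandedTokens(Decl->getSourceRange()); // O(1) per endpoint
// Both endpoints must be exact token locations, as AST-provided ranges are;
// otherwise expandedTokens(SourceRange) below falls back to binary search.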

llvm::ArrayRef<syntax::Token>
TokenBuffer::expandedTokens(SourceRange R) const {
  if (R.isInvalid())
    return {};
  if (!ExpandedTokIndex.empty()) {
    // Quick lookup if `R` is a token range.
    // This is a huge win since the majority of users use ranges provided by
    // the AST, and AST ranges are token ranges from the expanded token
    // stream.
    const auto B = ExpandedTokIndex.find(R.getBegin());
    const auto E = ExpandedTokIndex.find(R.getEnd());
    if (B != ExpandedTokIndex.end() && E != ExpandedTokIndex.end()) {
      const Token *L = ExpandedTokens.data() + B->getSecond();
      // Add 1 to End to make a half-open range.
      const Token *R = ExpandedTokens.data() + E->getSecond() + 1;
      if (L > R)
        return {};
      return {L, R};
    }
  }
  // Slow case. Use `isBeforeInTranslationUnit` to binary search for the
  // required range.
  return getTokensCovering(expandedTokens(), R, *SourceMgr);
}

CharSourceRange FileRange::toCharRange(const SourceManager &SM) const {
  return CharSourceRange(
      SourceRange(SM.getComposedLoc(File, Begin), SM.getComposedLoc(File, End)),
      /*IsTokenRange=*/false);
}

std::pair<const syntax::Token *, const TokenBuffer::Mapping *>
TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const {
  assert(Expanded);
  assert(ExpandedTokens.data() <= Expanded &&
         Expanded < ExpandedTokens.data() + ExpandedTokens.size());

  auto FileIt = Files.find(
      SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location())));
  assert(FileIt != Files.end() && "no file for an expanded token");

  const MarkedFile &File = FileIt->second;

  unsigned ExpandedIndex = Expanded - ExpandedTokens.data();
  // Find the first mapping that produced tokens after \p Expanded.
  auto It = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
    return M.BeginExpanded <= ExpandedIndex;
  });
  // Our token could only be produced by the previous mapping.
  if (It == File.Mappings.begin()) {
    // No previous mapping, no need to modify offsets.
    return {&File.SpelledTokens[ExpandedIndex - File.BeginExpanded],
            /*Mapping=*/nullptr};
  }
  --It; // 'It' now points to the last mapping that started before our token.

  // Check if the token is part of the mapping.
  if (ExpandedIndex < It->EndExpanded)
    return {&File.SpelledTokens[It->BeginSpelled], /*Mapping=*/&*It};

  // Not part of the mapping, use the index from the previous mapping to
  // compute the corresponding spelled token.
  return {
      &File.SpelledTokens[It->EndSpelled + (ExpandedIndex - It->EndExpanded)],
      /*Mapping=*/nullptr};
}

const TokenBuffer::Mapping *
TokenBuffer::mappingStartingBeforeSpelled(const MarkedFile &F,
                                          const syntax::Token *Spelled) {
  assert(F.SpelledTokens.data() <= Spelled);
  unsigned SpelledI = Spelled - F.SpelledTokens.data();
  assert(SpelledI < F.SpelledTokens.size());

  auto It = llvm::partition_point(F.Mappings, [SpelledI](const Mapping &M) {
    return M.BeginSpelled <= SpelledI;
  });
  if (It == F.Mappings.begin())
    return nullptr;
  --It;
  return &*It;
}
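
// Illustration of the Mapping invariants used above (hypothetical input, not
// from the original file): for
//   #define FOO a b
//   x FOO y
// the expanded stream is `x a b y`. The spelled tokens `x` and `y` lie
// outside any mapping and correspond 1:1 to their expanded counterparts via
// index arithmetic; the spelled token `FOO` is covered by a Mapping whose
// expanded side is `a b`.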

llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
TokenBuffer::expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
  if (Spelled.empty())
    return {};
  const auto &File = fileForSpelled(Spelled);

  auto *FrontMapping = mappingStartingBeforeSpelled(File, &Spelled.front());
  unsigned SpelledFrontI = &Spelled.front() - File.SpelledTokens.data();
  assert(SpelledFrontI < File.SpelledTokens.size());
  unsigned ExpandedBegin;
  if (!FrontMapping) {
    // No mapping that starts before the first token of Spelled, we don't
    // have to modify offsets.
    ExpandedBegin = File.BeginExpanded + SpelledFrontI;
  } else if (SpelledFrontI < FrontMapping->EndSpelled) {
    // This mapping applies to Spelled tokens.
    if (SpelledFrontI != FrontMapping->BeginSpelled) {
      // Spelled tokens don't cover the entire mapping, return an empty
      // result.
      return {}; // FIXME: support macro arguments.
    }
    // Spelled tokens start at the beginning of this mapping.
    ExpandedBegin = FrontMapping->BeginExpanded;
  } else {
    // Spelled tokens start after the mapping ends (they start in the hole
    // between 2 mappings, or between a mapping and the end of the file).
    ExpandedBegin =
        FrontMapping->EndExpanded + (SpelledFrontI - FrontMapping->EndSpelled);
  }

  auto *BackMapping = mappingStartingBeforeSpelled(File, &Spelled.back());
  unsigned SpelledBackI = &Spelled.back() - File.SpelledTokens.data();
  unsigned ExpandedEnd;
  if (!BackMapping) {
    // No mapping that starts before the last token of Spelled, we don't
    // have to modify offsets.
    ExpandedEnd = File.BeginExpanded + SpelledBackI + 1;
  } else if (SpelledBackI < BackMapping->EndSpelled) {
    // This mapping applies to Spelled tokens.
    if (SpelledBackI + 1 != BackMapping->EndSpelled) {
      // Spelled tokens don't cover the entire mapping, return an empty
      // result.
      return {}; // FIXME: support macro arguments.
    }
    ExpandedEnd = BackMapping->EndExpanded;
  } else {
    // Spelled tokens end after the mapping ends.
    ExpandedEnd =
        BackMapping->EndExpanded + (SpelledBackI - BackMapping->EndSpelled) + 1;
  }

  assert(ExpandedBegin < ExpandedTokens.size());
  assert(ExpandedEnd < ExpandedTokens.size());
  // Avoid returning empty ranges.
  if (ExpandedBegin == ExpandedEnd)
    return {};
  return {llvm::ArrayRef(ExpandedTokens.data() + ExpandedBegin,
                         ExpandedTokens.data() + ExpandedEnd)};
}

llvm::ArrayRef<syntax::Token> TokenBuffer::spelledTokens(FileID FID) const {
  auto It = Files.find(FID);
  assert(It != Files.end());
  return It->second.SpelledTokens;
}

const syntax::Token *
TokenBuffer::spelledTokenContaining(SourceLocation Loc) const {
  assert(Loc.isFileID());
  const auto *Tok = llvm::partition_point(
      spelledTokens(SourceMgr->getFileID(Loc)),
      [&](const syntax::Token &Tok) { return Tok.endLocation() <= Loc; });
  if (!Tok || Loc < Tok->location())
    return nullptr;
  return Tok;
}

std::string TokenBuffer::Mapping::str() const {
  return std::string(
      llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})",
                    BeginSpelled, EndSpelled, BeginExpanded, EndExpanded));
}
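
// Illustration (hypothetical input, not from the original file): given
//   #define INC(x) x + 1
//   INC(a)
// the mapping covers the spelled tokens `INC ( a )`. Calling
// expandedForSpelled on all four tokens returns the expanded slice `a + 1`;
// passing only `INC` covers the mapping partially, so the result is empty
// (see the FIXMEs above about macro arguments).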

std::optional<llvm::ArrayRef<syntax::Token>>
TokenBuffer::spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const {
  // In cases of invalid code, AST nodes can have source ranges that include
  // the `eof` token. As there's no spelling for this token, exclude it from
  // the range.
  if (!Expanded.empty() && Expanded.back().kind() == tok::eof) {
    Expanded = Expanded.drop_back();
  }
  // Mapping an empty range is ambiguous in case of empty mappings at either
  // end of the range, bail out in that case.
  if (Expanded.empty())
    return std::nullopt;
  const syntax::Token *First = &Expanded.front();
  const syntax::Token *Last = &Expanded.back();
  auto [FirstSpelled, FirstMapping] = spelledForExpandedToken(First);
  auto [LastSpelled, LastMapping] = spelledForExpandedToken(Last);

  FileID FID = SourceMgr->getFileID(FirstSpelled->location());
  // FIXME: Handle multi-file changes by trying to map onto a common root.
  if (FID != SourceMgr->getFileID(LastSpelled->location()))
    return std::nullopt;

  const MarkedFile &File = Files.find(FID)->second;

  // If the range is within one macro argument, the result may be only part
  // of a Mapping. We must use the general (SourceManager-based) algorithm.
  if (FirstMapping && FirstMapping == LastMapping &&
      SourceMgr->isMacroArgExpansion(First->location()) &&
      SourceMgr->isMacroArgExpansion(Last->location())) {
    // We use the excluded Prev/Next tokens for bounds checking.
    SourceLocation Prev = (First == &ExpandedTokens.front())
                              ? SourceLocation()
                              : (First - 1)->location();
    SourceLocation Next = (Last == &ExpandedTokens.back())
                              ? SourceLocation()
                              : (Last + 1)->location();
    SourceRange Range = spelledForExpandedSlow(
        First->location(), Last->location(), Prev, Next, FID, *SourceMgr);
    if (Range.isInvalid())
      return std::nullopt;
    return getTokensCovering(File.SpelledTokens, Range, *SourceMgr);
  }

  // Otherwise, use the fast version based on Mappings.
  // Do not allow changes that don't cover the full expansion.
  unsigned FirstExpanded = Expanded.begin() - ExpandedTokens.data();
  unsigned LastExpanded = Expanded.end() - ExpandedTokens.data();
  if (FirstMapping && FirstExpanded != FirstMapping->BeginExpanded)
    return std::nullopt;
  if (LastMapping && LastMapping->EndExpanded != LastExpanded)
    return std::nullopt;
  return llvm::ArrayRef(
      FirstMapping ? File.SpelledTokens.data() + FirstMapping->BeginSpelled
                   : FirstSpelled,
      LastMapping ? File.SpelledTokens.data() + LastMapping->EndSpelled
                  : LastSpelled + 1);
}

TokenBuffer::Expansion TokenBuffer::makeExpansion(const MarkedFile &F,
                                                  const Mapping &M) const {
  Expansion E;
  E.Spelled = llvm::ArrayRef(F.SpelledTokens.data() + M.BeginSpelled,
                             F.SpelledTokens.data() + M.EndSpelled);
  E.Expanded = llvm::ArrayRef(ExpandedTokens.data() + M.BeginExpanded,
                              ExpandedTokens.data() + M.EndExpanded);
  return E;
}

const TokenBuffer::MarkedFile &
TokenBuffer::fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
  assert(!Spelled.empty());
  assert(Spelled.front().location().isFileID() && "not a spelled token");
  auto FileIt = Files.find(SourceMgr->getFileID(Spelled.front().location()));
  assert(FileIt != Files.end() && "file not tracked by token buffer");
  const auto &File = FileIt->second;
  assert(File.SpelledTokens.data() <= Spelled.data() &&
         Spelled.end() <=
             (File.SpelledTokens.data() + File.SpelledTokens.size()) &&
         "Tokens not in spelled range");
#ifndef NDEBUG
  auto T1 = Spelled.back().location();
  auto T2 = File.SpelledTokens.back().location();
  assert(T1 == T2 || sourceManager().isBeforeInTranslationUnit(T1, T2));
#endif
  return File;
}

std::optional<TokenBuffer::Expansion>
TokenBuffer::expansionStartingAt(const syntax::Token *Spelled) const {
  assert(Spelled);
  const auto &File = fileForSpelled(*Spelled);

  unsigned SpelledIndex = Spelled - File.SpelledTokens.data();
  auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
    return M.BeginSpelled < SpelledIndex;
  });
  if (M == File.Mappings.end() || M->BeginSpelled != SpelledIndex)
    return std::nullopt;
  return makeExpansion(File, *M);
}
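
// Illustrative usage (`Buffer` and `FooTok` are hypothetical, not from the
// original file): an expansion starts exactly at the first spelled token of
// a mapping, e.g. the macro name of a call:
//   if (auto E = Buffer.expansionStartingAt(FooTok))
//     llvm::errs() << E->Spelled.size() << " -> " << E->Expanded.size();
// Pointing at any later token of the same call returns std::nullopt.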

std::vector<TokenBuffer::Expansion> TokenBuffer::expansionsOverlapping(
    llvm::ArrayRef<syntax::Token> Spelled) const {
  if (Spelled.empty())
    return {};
  const auto &File = fileForSpelled(Spelled);

  // Find the first overlapping range, and then copy until we stop
  // overlapping.
  unsigned SpelledBeginIndex = Spelled.begin() - File.SpelledTokens.data();
  unsigned SpelledEndIndex = Spelled.end() - File.SpelledTokens.data();
  auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
    return M.EndSpelled <= SpelledBeginIndex;
  });
  std::vector<TokenBuffer::Expansion> Expansions;
  for (; M != File.Mappings.end() && M->BeginSpelled < SpelledEndIndex; ++M)
    Expansions.push_back(makeExpansion(File, *M));
  return Expansions;
}

llvm::ArrayRef<syntax::Token>
syntax::spelledTokensTouching(SourceLocation Loc,
                              llvm::ArrayRef<syntax::Token> Tokens) {
  assert(Loc.isFileID());

  auto *Right = llvm::partition_point(
      Tokens, [&](const syntax::Token &Tok) { return Tok.location() < Loc; });
  bool AcceptRight = Right != Tokens.end() && Right->location() <= Loc;
  bool AcceptLeft =
      Right != Tokens.begin() && (Right - 1)->endLocation() >= Loc;
  return llvm::ArrayRef(Right - (AcceptLeft ? 1 : 0),
                        Right + (AcceptRight ? 1 : 0));
}

llvm::ArrayRef<syntax::Token>
syntax::spelledTokensTouching(SourceLocation Loc,
                              const syntax::TokenBuffer &Tokens) {
  return spelledTokensTouching(
      Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
}

const syntax::Token *
syntax::spelledIdentifierTouching(SourceLocation Loc,
                                  llvm::ArrayRef<syntax::Token> Tokens) {
  for (const syntax::Token &Tok : spelledTokensTouching(Loc, Tokens)) {
    if (Tok.kind() == tok::identifier)
      return &Tok;
  }
  return nullptr;
}

const syntax::Token *
syntax::spelledIdentifierTouching(SourceLocation Loc,
                                  const syntax::TokenBuffer &Tokens) {
  return spelledIdentifierTouching(
      Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
}

std::vector<const syntax::Token *>
TokenBuffer::macroExpansions(FileID FID) const {
  auto FileIt = Files.find(FID);
  assert(FileIt != Files.end() && "file not tracked by token buffer");
  auto &File = FileIt->second;
  std::vector<const syntax::Token *> Expansions;
  auto &Spelled = File.SpelledTokens;
  for (auto Mapping : File.Mappings) {
    const syntax::Token *Token = &Spelled[Mapping.BeginSpelled];
    if (Token->kind() == tok::TokenKind::identifier)
      Expansions.push_back(Token);
  }
  return Expansions;
}

std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
                                            const SourceManager &SM,
                                            const LangOptions &LO) {
  std::vector<syntax::Token> Tokens;
  IdentifierTable Identifiers(LO);
  auto AddToken = [&](clang::Token T) {
    // Fill the proper token kind for keywords, etc.
    if (T.getKind() == tok::raw_identifier && !T.needsCleaning() &&
        !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases.
      clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
      T.setIdentifierInfo(&II);
      T.setKind(II.getTokenID());
    }
    Tokens.push_back(syntax::Token(T));
  };

  auto SrcBuffer = SM.getBufferData(FR.file());
  Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
          SrcBuffer.data() + FR.beginOffset(),
          // We can't make BufEnd point to FR.endOffset, as Lexer requires a
          // null-terminated buffer.
          SrcBuffer.data() + SrcBuffer.size());

  clang::Token T;
  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
    AddToken(T);
  // LexFromRawLexer returns true when it parses the last token of the file,
  // add it iff it starts within the range we are interested in.
  if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
    AddToken(T);
  return Tokens;
}

std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
                                            const LangOptions &LO) {
  return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO);
}
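
// Illustrative usage (a sketch, not from the original file): raw-lexing a
// file containing `int a;` yields three tokens whose kinds are resolved via
// the IdentifierTable above:
//   std::vector<syntax::Token> Toks = syntax::tokenize(FID, SM, LangOpts);
//   // Toks[0].kind() == tok::kw_int, Toks[1].kind() == tok::identifier,
//   // Toks[2].kind() == tok::semi.
// No preprocessing happens here, so a macro name lexes as a plain identifier.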

/// Records information required to construct mappings for the token buffer
/// that we are collecting.
class TokenCollector::CollectPPExpansions : public PPCallbacks {
public:
  CollectPPExpansions(TokenCollector &C) : Collector(&C) {}

  /// A disabled instance stops reporting anything to the TokenCollector.
  /// This ensures that uses of the preprocessor after
  /// TokenCollector::consume() is called do not access the (possibly
  /// invalid) collector instance.
  void disable() { Collector = nullptr; }

  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
                    SourceRange Range, const MacroArgs *Args) override {
    if (!Collector)
      return;
    const auto &SM = Collector->PP.getSourceManager();
    // Only record top-level expansions that directly produce expanded tokens.
    // This excludes those where:
    //   - the macro use is inside a macro body,
    //   - the macro appears in an argument to another macro.
    // However macro expansion isn't really a tree, it's token rewrite rules,
    // so there are other cases, e.g.
    //   #define B(X) X
    //   #define A 1 + B
    //   A(2)
    // Both A and B produce expanded tokens, though the macro name 'B' comes
    // from an expansion. The best we can do is merge the mappings for both.

    // The *last* token of any top-level macro expansion must be in a file.
    // (In the example above, see the closing paren of the expansion of B.)
    if (!Range.getEnd().isFileID())
      return;
    // If there's a current expansion that encloses this one, this one can't
    // be top-level.
    if (LastExpansionEnd.isValid() &&
        !SM.isBeforeInTranslationUnit(LastExpansionEnd, Range.getEnd()))
      return;

    // If the macro invocation (B) starts in a macro (A) but ends in a file,
    // we'll create a merged mapping for A + B by overwriting the endpoint
    // recorded for A's startpoint.
    if (!Range.getBegin().isFileID()) {
      Range.setBegin(SM.getExpansionLoc(Range.getBegin()));
      assert(Collector->Expansions.count(Range.getBegin()) &&
             "Overlapping macros should have same expansion location");
    }

    Collector->Expansions[Range.getBegin()] = Range.getEnd();
    LastExpansionEnd = Range.getEnd();
  }
  // FIXME: handle directives like #pragma, #include, etc.
private:
  TokenCollector *Collector;
  /// Used to detect recursive macro expansions.
  SourceLocation LastExpansionEnd;
};
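
// Illustration (hypothetical input, not from the original file): for
//   #define ID(X) X
//   ID(ID(a))
// only the outer ID call is recorded: the inner call's range ends inside the
// outer one, so the isBeforeInTranslationUnit check above drops it. The
// Expansions map then takes the file location of the outer `ID` to the file
// location of the final `)`.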

/// Fills in the TokenBuffer by tracing the run of a preprocessor. The
/// implementation tracks the tokens, macro expansions and directives coming
/// from the preprocessor and:
/// - for each token, figures out if it is a part of an expanded token
///   stream, spelled token stream or both. Stores the tokens appropriately.
/// - records mappings from the spelled to expanded token ranges, e.g. for
///   macro expansions.
/// FIXME: also properly record:
///   - #include directives,
///   - #pragma, #line and other PP directives,
///   - skipped pp regions,
///   - ...

TokenCollector::TokenCollector(Preprocessor &PP) : PP(PP) {
  // Collect the expanded token stream during preprocessing.
  PP.setTokenWatcher([this](const clang::Token &T) {
    if (T.isAnnotation())
      return;
    DEBUG_WITH_TYPE("collect-tokens", llvm::dbgs()
                                          << "Token: "
                                          << syntax::Token(T).dumpForTests(
                                                 this->PP.getSourceManager())
                                          << "\n");
    Expanded.push_back(syntax::Token(T));
  });
  // And locations of macro calls, to properly recover boundaries of those in
  // case of empty expansions.
  auto CB = std::make_unique<CollectPPExpansions>(*this);
  this->Collector = CB.get();
  PP.addPPCallbacks(std::move(CB));
}

/// Builds mappings and spelled tokens in the TokenBuffer based on the expanded
/// token stream.
class TokenCollector::Builder {
public:
  Builder(std::vector<syntax::Token> Expanded, PPExpansions CollectedExpansions,
          const SourceManager &SM, const LangOptions &LangOpts)
      : Result(SM), CollectedExpansions(std::move(CollectedExpansions)), SM(SM),
        LangOpts(LangOpts) {
    Result.ExpandedTokens = std::move(Expanded);
  }

  TokenBuffer build() && {
    assert(!Result.ExpandedTokens.empty());
    assert(Result.ExpandedTokens.back().kind() == tok::eof);

    // Tokenize every file that contributed tokens to the expanded stream.
    buildSpelledTokens();

    // The expanded token stream consists of runs of tokens that came from
    // the same source (a macro expansion, part of a file etc).
    // Between these runs are the logical positions of spelled tokens that
    // didn't expand to anything.
    while (NextExpanded < Result.ExpandedTokens.size() - 1 /* eof */) {
      // Create empty mappings for spelled tokens that expanded to nothing
      // here. May advance NextSpelled, but NextExpanded is unchanged.
      discard();
      // Create a mapping for a contiguous run of expanded tokens.
      // Advances NextExpanded past the run, and NextSpelled accordingly.
      unsigned OldPosition = NextExpanded;
      advance();
      if (NextExpanded == OldPosition)
        diagnoseAdvanceFailure();
    }
    // If any tokens remain in any of the files, they didn't expand to
    // anything. Create empty mappings up until the end of the file.
    for (const auto &File : Result.Files)
      discard(File.first);

#ifndef NDEBUG
    for (auto &pair : Result.Files) {
      auto &mappings = pair.second.Mappings;
      assert(llvm::is_sorted(mappings, [](const TokenBuffer::Mapping &M1,
                                          const TokenBuffer::Mapping &M2) {
        return M1.BeginSpelled < M2.BeginSpelled &&
               M1.EndSpelled < M2.EndSpelled &&
               M1.BeginExpanded < M2.BeginExpanded &&
               M1.EndExpanded < M2.EndExpanded;
      }));
    }
#endif

    return std::move(Result);
  }
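
  // Walk-through (hypothetical input, not from the original file): for
  //   #define EMPTY
  //   EMPTY int a;
  // the expanded stream is `int a ; <eof>`. discard() consumes the spelled
  // tokens `# define EMPTY` and the `EMPTY` call, emitting empty mappings
  // for them; advance() then matches `int a ;` against the spelled stream
  // one-to-one, which needs no mapping at all.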

private:
  // Consume a sequence of spelled tokens that didn't expand to anything.
  // In the simplest case, skips spelled tokens until finding one that
  // produced the NextExpanded token, and creates an empty mapping for them.
  // If Drain is provided, skips remaining tokens from that file instead.
  void discard(std::optional<FileID> Drain = std::nullopt) {
    SourceLocation Target =
        Drain ? SM.getLocForEndOfFile(*Drain)
              : SM.getExpansionLoc(
                    Result.ExpandedTokens[NextExpanded].location());
    FileID File = SM.getFileID(Target);
    const auto &SpelledTokens = Result.Files[File].SpelledTokens;
    auto &NextSpelled = this->NextSpelled[File];

    TokenBuffer::Mapping Mapping;
    Mapping.BeginSpelled = NextSpelled;
    // When dropping trailing tokens from a file, the empty mapping should
    // be positioned within the file's expanded-token range (at the end).
    Mapping.BeginExpanded = Mapping.EndExpanded =
        Drain ? Result.Files[*Drain].EndExpanded : NextExpanded;
    // We may want to split into several adjacent empty mappings.
    // FlushMapping() emits the current mapping and starts a new one.
    auto FlushMapping = [&, this] {
      Mapping.EndSpelled = NextSpelled;
      if (Mapping.BeginSpelled != Mapping.EndSpelled)
        Result.Files[File].Mappings.push_back(Mapping);
      Mapping.BeginSpelled = NextSpelled;
    };

    while (NextSpelled < SpelledTokens.size() &&
           SpelledTokens[NextSpelled].location() < Target) {
      // If we know mapping bounds at [NextSpelled, KnownEnd] (macro
      // expansion) then we want to partition our (empty) mapping.
      //   [Start, NextSpelled) [NextSpelled, KnownEnd] (KnownEnd, Target)
      SourceLocation KnownEnd =
          CollectedExpansions.lookup(SpelledTokens[NextSpelled].location());
      if (KnownEnd.isValid()) {
        FlushMapping(); // Emits [Start, NextSpelled)
        while (NextSpelled < SpelledTokens.size() &&
               SpelledTokens[NextSpelled].location() <= KnownEnd)
          ++NextSpelled;
        FlushMapping(); // Emits [NextSpelled, KnownEnd]
        // Now the loop continues and will emit (KnownEnd, Target).
      } else {
        ++NextSpelled;
      }
    }
    FlushMapping();
  }
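
  // Splitting illustration (hypothetical input, not from the original file):
  // given
  //   #define EMPTY
  //   EMPTY EMPTY x
  // both EMPTY calls expand to nothing, but each has bounds recorded in
  // CollectedExpansions, so FlushMapping() emits a separate empty mapping
  // per call rather than one merged mapping; expansionStartingAt() can then
  // locate each call by its macro-name token.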

  // Consumes the NextExpanded token and others that are part of the same run.
  // Increases NextExpanded and NextSpelled by at least one, and adds a
  // mapping (unless this is a run of file tokens, which we represent with no
  // mapping).
  void advance() {
    const syntax::Token &Tok = Result.ExpandedTokens[NextExpanded];
    SourceLocation Expansion = SM.getExpansionLoc(Tok.location());
    FileID File = SM.getFileID(Expansion);
    const auto &SpelledTokens = Result.Files[File].SpelledTokens;
    auto &NextSpelled = this->NextSpelled[File];

    if (Tok.location().isFileID()) {
      // A run of file tokens continues while the expanded/spelled tokens
      // match.
      while (NextSpelled < SpelledTokens.size() &&
             NextExpanded < Result.ExpandedTokens.size() &&
             SpelledTokens[NextSpelled].location() ==
                 Result.ExpandedTokens[NextExpanded].location()) {
        ++NextSpelled;
        ++NextExpanded;
      }
      // We need no mapping for file tokens copied to the expanded stream.
    } else {
      // We found a new macro expansion. We should have its spelling bounds.
      auto End = CollectedExpansions.lookup(Expansion);
      assert(End.isValid() && "Macro expansion wasn't captured?");

      // Mapping starts here...
      TokenBuffer::Mapping Mapping;
      Mapping.BeginExpanded = NextExpanded;
      Mapping.BeginSpelled = NextSpelled;
      // ... consumes spelled tokens within bounds we captured ...
      while (NextSpelled < SpelledTokens.size() &&
             SpelledTokens[NextSpelled].location() <= End)
        ++NextSpelled;
      // ... consumes expanded tokens rooted at the same expansion ...
      while (NextExpanded < Result.ExpandedTokens.size() &&
             SM.getExpansionLoc(
                 Result.ExpandedTokens[NextExpanded].location()) == Expansion)
        ++NextExpanded;
      // ... and ends here.
      Mapping.EndExpanded = NextExpanded;
      Mapping.EndSpelled = NextSpelled;
      Result.Files[File].Mappings.push_back(Mapping);
    }
  }

  // advance() is supposed to consume at least one token - if not, we crash.
  void diagnoseAdvanceFailure() {
#ifndef NDEBUG
    // Show the failed-to-map token in context.
    for (unsigned I = (NextExpanded < 10) ? 0 : NextExpanded - 10;
         I < NextExpanded + 5 && I < Result.ExpandedTokens.size(); ++I) {
      const char *L =
          (I == NextExpanded) ? "!! " : (I < NextExpanded) ? "ok " : "   ";
      llvm::errs() << L << Result.ExpandedTokens[I].dumpForTests(SM) << "\n";
    }
#endif
    llvm_unreachable("Couldn't map expanded token to spelled tokens!");
  }

  /// Initializes TokenBuffer::Files and fills spelled tokens and expanded
  /// ranges for each of the files.
  void buildSpelledTokens() {
    for (unsigned I = 0; I < Result.ExpandedTokens.size(); ++I) {
      const auto &Tok = Result.ExpandedTokens[I];
      auto FID = SM.getFileID(SM.getExpansionLoc(Tok.location()));
      auto It = Result.Files.try_emplace(FID);
      TokenBuffer::MarkedFile &File = It.first->second;

      // The eof token should not be considered part of the main-file's range.
      File.EndExpanded = Tok.kind() == tok::eof ? I : I + 1;

      if (!It.second)
        continue; // we have seen this file before.
      // This is the first time we see this file.
      File.BeginExpanded = I;
      File.SpelledTokens = tokenize(FID, SM, LangOpts);
    }
  }

  TokenBuffer Result;
  unsigned NextExpanded = 0;                    // cursor in ExpandedTokens
  llvm::DenseMap<FileID, unsigned> NextSpelled; // cursor in SpelledTokens
  PPExpansions CollectedExpansions;
  const SourceManager &SM;
  const LangOptions &LangOpts;
};

TokenBuffer TokenCollector::consume() && {
  PP.setTokenWatcher(nullptr);
  Collector->disable();
  return Builder(std::move(Expanded), std::move(Expansions),
                 PP.getSourceManager(), PP.getLangOpts())
      .build();
}
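
// Typical usage (an illustrative sketch; `PP` and the parse step stand in
// for the caller's setup and are not part of this file):
//   TokenCollector Collector(PP);  // attach before preprocessing starts
//   // ... run the parser / preprocessor to the end of the main file ...
//   TokenBuffer Tokens = std::move(Collector).consume();
// consume() is rvalue-qualified: the collector is spent afterwards, and
// disable() above turns any further PP callbacks into no-ops.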

std::string syntax::Token::str() const {
  return std::string(llvm::formatv("Token({0}, length = {1})",
                                   tok::getTokenName(kind()), length()));
}

std::string syntax::Token::dumpForTests(const SourceManager &SM) const {
  return std::string(llvm::formatv("Token(`{0}`, {1}, length = {2})", text(SM),
                                   tok::getTokenName(kind()), length()));
}

std::string TokenBuffer::dumpForTests() const {
  auto PrintToken = [this](const syntax::Token &T) -> std::string {
    if (T.kind() == tok::eof)
      return "<eof>";
    return std::string(T.text(*SourceMgr));
  };

  auto DumpTokens = [this, &PrintToken](llvm::raw_ostream &OS,
                                        llvm::ArrayRef<syntax::Token> Tokens) {
    if (Tokens.empty()) {
      OS << "<empty>";
      return;
    }
    OS << Tokens[0].text(*SourceMgr);
    for (unsigned I = 1; I < Tokens.size(); ++I) {
      if (Tokens[I].kind() == tok::eof)
        continue;
      OS << " " << PrintToken(Tokens[I]);
    }
  };

  std::string Dump;
  llvm::raw_string_ostream OS(Dump);

  OS << "expanded tokens:\n"
     << "  ";
  // (!) we do not show '<eof>'.
  DumpTokens(OS, llvm::ArrayRef(ExpandedTokens).drop_back());
  OS << "\n";

  std::vector<FileID> Keys;
  for (const auto &F : Files)
    Keys.push_back(F.first);
  llvm::sort(Keys);

  for (FileID ID : Keys) {
    const MarkedFile &File = Files.find(ID)->second;
    auto Entry = SourceMgr->getFileEntryRefForID(ID);
    if (!Entry)
      continue; // Skip builtin files.
    std::string Path = llvm::sys::path::convert_to_slash(Entry->getName());
    OS << llvm::formatv("file '{0}'\n", Path) << "  spelled tokens:\n"
       << "    ";
    DumpTokens(OS, File.SpelledTokens);
    OS << "\n";

    if (File.Mappings.empty()) {
      OS << "  no mappings.\n";
      continue;
    }
    OS << "  mappings:\n";
    for (auto &M : File.Mappings) {
      OS << llvm::formatv(
          "    ['{0}'_{1}, '{2}'_{3}) => ['{4}'_{5}, '{6}'_{7})\n",
          PrintToken(File.SpelledTokens[M.BeginSpelled]), M.BeginSpelled,
          M.EndSpelled == File.SpelledTokens.size()
              ? "<eof>"
              : PrintToken(File.SpelledTokens[M.EndSpelled]),
          M.EndSpelled, PrintToken(ExpandedTokens[M.BeginExpanded]),
          M.BeginExpanded, PrintToken(ExpandedTokens[M.EndExpanded]),
          M.EndExpanded);
    }
  }
  return Dump;
}
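
// Example output (illustrative; `input.cpp` is hypothetical and the exact
// spacing is approximate) for:
//   #define FOO a
//   FOO b
// dumpForTests() prints roughly:
//   expanded tokens:
//     a b
//   file 'input.cpp'
//     spelled tokens:
//       # define FOO a FOO b
//     mappings:
//       ['#'_0, 'FOO'_4) => ['a'_0, 'a'_0)
//       ['FOO'_4, 'b'_5) => ['a'_0, 'b'_1)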