// Path: blob/main/contrib/llvm-project/clang/lib/Tooling/Transformer/SourceCode.cpp
// 35266 views
//===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file provides functions that simplify extraction of source code.9//10//===----------------------------------------------------------------------===//11#include "clang/Tooling/Transformer/SourceCode.h"12#include "clang/AST/ASTContext.h"13#include "clang/AST/Attr.h"14#include "clang/AST/Comment.h"15#include "clang/AST/Decl.h"16#include "clang/AST/DeclCXX.h"17#include "clang/AST/DeclTemplate.h"18#include "clang/AST/Expr.h"19#include "clang/Basic/SourceManager.h"20#include "clang/Lex/Lexer.h"21#include "llvm/Support/Errc.h"22#include "llvm/Support/Error.h"23#include <set>2425using namespace clang;2627using llvm::errc;28using llvm::StringError;2930StringRef clang::tooling::getText(CharSourceRange Range,31const ASTContext &Context) {32return Lexer::getSourceText(Range, Context.getSourceManager(),33Context.getLangOpts());34}3536CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,37tok::TokenKind Next,38ASTContext &Context) {39CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),40Context.getLangOpts());41if (R.isInvalid())42return Range;43Token Tok;44bool Err =45Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),46Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);47if (Err || !Tok.is(Next))48return Range;49return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());50}5152llvm::Error clang::tooling::validateRange(const CharSourceRange &Range,53const SourceManager &SM,54bool AllowSystemHeaders) {55if (Range.isInvalid())56return llvm::make_error<StringError>(errc::invalid_argument,57"Invalid range");5859if 
(Range.getBegin().isMacroID() || Range.getEnd().isMacroID())60return llvm::make_error<StringError>(61errc::invalid_argument, "Range starts or ends in a macro expansion");6263if (!AllowSystemHeaders) {64if (SM.isInSystemHeader(Range.getBegin()) ||65SM.isInSystemHeader(Range.getEnd()))66return llvm::make_error<StringError>(errc::invalid_argument,67"Range is in system header");68}6970std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());71std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());72if (BeginInfo.first != EndInfo.first)73return llvm::make_error<StringError>(74errc::invalid_argument, "Range begins and ends in different files");7576if (BeginInfo.second > EndInfo.second)77return llvm::make_error<StringError>(errc::invalid_argument,78"Range's begin is past its end");7980return llvm::Error::success();81}8283llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,84const SourceManager &SM) {85return validateRange(Range, SM, /*AllowSystemHeaders=*/false);86}8788static bool spelledInMacroDefinition(SourceLocation Loc,89const SourceManager &SM) {90while (Loc.isMacroID()) {91const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion();92if (Expansion.isMacroArgExpansion()) {93// Check the spelling location of the macro arg, in case the arg itself is94// in a macro expansion.95Loc = Expansion.getSpellingLoc();96} else {97return true;98}99}100return false;101}102103// Returns the expansion char-range of `Loc` if `Loc` is a split token. 
For104// example, `>>` in nested templates needs the first `>` to be split, otherwise105// the `SourceLocation` of the token would lex as `>>` instead of `>`.106static std::optional<CharSourceRange>107getExpansionForSplitToken(SourceLocation Loc, const SourceManager &SM,108const LangOptions &LangOpts) {109if (Loc.isMacroID()) {110bool Invalid = false;111auto &SLoc = SM.getSLocEntry(SM.getFileID(Loc), &Invalid);112if (Invalid)113return std::nullopt;114if (auto &Expansion = SLoc.getExpansion();115!Expansion.isExpansionTokenRange()) {116// A char-range expansion is only used where a token-range would be117// incorrect, and so identifies this as a split token (and importantly,118// not as a macro).119return Expansion.getExpansionLocRange();120}121}122return std::nullopt;123}124125// If `Range` covers a split token, returns the expansion range, otherwise126// returns `Range`.127static CharSourceRange getRangeForSplitTokens(CharSourceRange Range,128const SourceManager &SM,129const LangOptions &LangOpts) {130if (Range.isTokenRange()) {131auto BeginToken = getExpansionForSplitToken(Range.getBegin(), SM, LangOpts);132auto EndToken = getExpansionForSplitToken(Range.getEnd(), SM, LangOpts);133if (EndToken) {134SourceLocation BeginLoc =135BeginToken ? 
BeginToken->getBegin() : Range.getBegin();136// We can't use the expansion location with a token-range, because that137// will incorrectly lex the end token, so use a char-range that ends at138// the split.139return CharSourceRange::getCharRange(BeginLoc, EndToken->getEnd());140} else if (BeginToken) {141// Since the end token is not split, the whole range covers the split, so142// the only adjustment we make is to use the expansion location of the143// begin token.144return CharSourceRange::getTokenRange(BeginToken->getBegin(),145Range.getEnd());146}147}148return Range;149}150151static CharSourceRange getRange(const CharSourceRange &EditRange,152const SourceManager &SM,153const LangOptions &LangOpts,154bool IncludeMacroExpansion) {155CharSourceRange Range;156if (IncludeMacroExpansion) {157Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);158} else {159auto AdjustedRange = getRangeForSplitTokens(EditRange, SM, LangOpts);160if (spelledInMacroDefinition(AdjustedRange.getBegin(), SM) ||161spelledInMacroDefinition(AdjustedRange.getEnd(), SM))162return {};163164auto B = SM.getSpellingLoc(AdjustedRange.getBegin());165auto E = SM.getSpellingLoc(AdjustedRange.getEnd());166if (AdjustedRange.isTokenRange())167E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts);168Range = CharSourceRange::getCharRange(B, E);169}170return Range;171}172173std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit(174const CharSourceRange &EditRange, const SourceManager &SM,175const LangOptions &LangOpts, bool IncludeMacroExpansion) {176CharSourceRange Range =177getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);178bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));179if (IsInvalid)180return std::nullopt;181return Range;182}183184std::optional<CharSourceRange> clang::tooling::getFileRange(185const CharSourceRange &EditRange, const SourceManager &SM,186const LangOptions &LangOpts, bool IncludeMacroExpansion) {187CharSourceRange Range =188getRange(EditRange, SM, 
LangOpts, IncludeMacroExpansion);189bool IsInvalid =190llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true));191if (IsInvalid)192return std::nullopt;193return Range;194}195196static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {197return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);198}199200static bool contains(const std::set<tok::TokenKind> &Terminators,201const Token &Tok) {202return Terminators.count(Tok.getKind()) > 0;203}204205// Returns the exclusive, *file* end location of the entity whose last token is206// at location 'EntityLast'. That is, it returns the location one past the last207// relevant character.208//209// Associated tokens include comments, horizontal whitespace and 'Terminators'210// -- optional tokens, which, if any are found, will be included; if211// 'Terminators' is empty, we will not include any extra tokens beyond comments212// and horizontal whitespace.213static SourceLocation214getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,215const std::set<tok::TokenKind> &Terminators,216const LangOptions &LangOpts) {217assert(EntityLast.isValid() && "Invalid end location found.");218219// We remember the last location of a non-horizontal-whitespace token we have220// lexed; this is the location up to which we will want to delete.221// FIXME: Support using the spelling loc here for cases where we want to222// analyze the macro text.223224CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);225// FIXME: Should check isTokenRange(), for the (rare) case that226// `ExpansionRange` is a character range.227std::unique_ptr<Lexer> Lexer = [&]() {228bool Invalid = false;229auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());230llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);231assert(!Invalid && "Cannot get file/offset");232return std::make_unique<clang::Lexer>(233SM.getLocForStartOfFile(FileOffset.first), LangOpts, 
File.begin(),234File.data() + FileOffset.second, File.end());235}();236237// Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).238Lexer->SetKeepWhitespaceMode(true);239240// Generally, the code we want to include looks like this ([] are optional),241// If Terminators is empty:242// [ <comment> ] [ <newline> ]243// Otherwise:244// ... <terminator> [ <comment> ] [ <newline> ]245246Token Tok;247bool Terminated = false;248249// First, lex to the current token (which is the last token of the range that250// is definitely associated with the decl). Then, we process the first token251// separately from the rest based on conditions that hold specifically for252// that first token.253//254// We do not search for a terminator if none is required or we've already255// encountered it. Otherwise, if the original `EntityLast` location was in a256// macro expansion, we don't have visibility into the text, so we assume we've257// already terminated. However, we note this assumption with258// `TerminatedByMacro`, because we'll want to handle it somewhat differently259// for the terminators semicolon and comma. These terminators can be safely260// associated with the entity when they appear after the macro -- extra261// semicolons have no effect on the program and a well-formed program won't262// have multiple commas in a row, so we're guaranteed that there is only one.263//264// FIXME: This handling of macros is more conservative than necessary. When265// the end of the expansion coincides with the end of the node, we can still266// safely analyze the code. 
But, it is more complicated, because we need to267// start by lexing the spelling loc for the first token and then switch to the268// expansion loc.269bool TerminatedByMacro = false;270Lexer->LexFromRawLexer(Tok);271if (Terminators.empty() || contains(Terminators, Tok))272Terminated = true;273else if (EntityLast.isMacroID()) {274Terminated = true;275TerminatedByMacro = true;276}277278// We save the most recent candidate for the exclusive end location.279SourceLocation End = Tok.getEndLoc();280281while (!Terminated) {282// Lex the next token we want to possibly expand the range with.283Lexer->LexFromRawLexer(Tok);284285switch (Tok.getKind()) {286case tok::eof:287// Unexpected separators.288case tok::l_brace:289case tok::r_brace:290case tok::comma:291return End;292// Whitespace pseudo-tokens.293case tok::unknown:294if (startsWithNewline(SM, Tok))295// Include at least until the end of the line.296End = Tok.getEndLoc();297break;298default:299if (contains(Terminators, Tok))300Terminated = true;301End = Tok.getEndLoc();302break;303}304}305306do {307// Lex the next token we want to possibly expand the range with.308Lexer->LexFromRawLexer(Tok);309310switch (Tok.getKind()) {311case tok::unknown:312if (startsWithNewline(SM, Tok))313// We're done, but include this newline.314return Tok.getEndLoc();315break;316case tok::comment:317// Include any comments we find on the way.318End = Tok.getEndLoc();319break;320case tok::semi:321case tok::comma:322if (TerminatedByMacro && contains(Terminators, Tok)) {323End = Tok.getEndLoc();324// We've found a real terminator.325TerminatedByMacro = false;326break;327}328// Found an unrelated token; stop and don't include it.329return End;330default:331// Found an unrelated token; stop and don't include it.332return End;333}334} while (true);335}336337// Returns the expected terminator tokens for the given declaration.338//339// If we do not know the correct terminator token, returns an empty set.340//341// There are cases where we have more 
than one possible terminator (for example,342// we find either a comma or a semicolon after a VarDecl).343static std::set<tok::TokenKind> getTerminators(const Decl &D) {344if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))345return {tok::semi};346347if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))348return {tok::r_brace, tok::semi};349350if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))351return {tok::comma, tok::semi};352353return {};354}355356// Starting from `Loc`, skips whitespace up to, and including, a single357// newline. Returns the (exclusive) end of any skipped whitespace (that is, the358// location immediately after the whitespace).359static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,360SourceLocation Loc,361const LangOptions &LangOpts) {362const char *LocChars = SM.getCharacterData(Loc);363int i = 0;364while (isHorizontalWhitespace(LocChars[i]))365++i;366if (isVerticalWhitespace(LocChars[i]))367++i;368return Loc.getLocWithOffset(i);369}370371// Is `Loc` separated from any following decl by something meaningful (e.g. an372// empty line, a comment), ignoring horizontal whitespace? Since this is a373// heuristic, we return false when in doubt. `Loc` cannot be the first location374// in the file.375static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,376const LangOptions &LangOpts) {377// If the preceding character is a newline, we'll check for an empty line as a378// separator. However, we can't identify an empty line using tokens, so we379// analyse the characters. 
If we try to use tokens, we'll just end up with a380// whitespace token, whose characters we'd have to analyse anyhow.381bool Invalid = false;382const char *LocChars =383SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);384assert(!Invalid &&385"Loc must be a valid character and not the first of the source file.");386if (isVerticalWhitespace(LocChars[0])) {387for (int i = 1; isWhitespace(LocChars[i]); ++i)388if (isVerticalWhitespace(LocChars[i]))389return true;390}391// We didn't find an empty line, so lex the next token, skipping past any392// whitespace we just scanned.393Token Tok;394bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,395/*IgnoreWhiteSpace=*/true);396if (Failed)397// Any text that confuses the lexer seems fair to consider a separation.398return true;399400switch (Tok.getKind()) {401case tok::comment:402case tok::l_brace:403case tok::r_brace:404case tok::eof:405return true;406default:407return false;408}409}410411CharSourceRange tooling::getAssociatedRange(const Decl &Decl,412ASTContext &Context) {413const SourceManager &SM = Context.getSourceManager();414const LangOptions &LangOpts = Context.getLangOpts();415CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());416417// First, expand to the start of the template<> declaration if necessary.418if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {419if (const auto *T = Record->getDescribedClassTemplate())420if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))421Range.setBegin(T->getBeginLoc());422} else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {423if (const auto *T = F->getDescribedFunctionTemplate())424if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))425Range.setBegin(T->getBeginLoc());426}427428// Next, expand the end location past trailing comments to include a potential429// newline at the end of the decl's line.430Range.setEnd(431getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), 
LangOpts));432Range.setTokenRange(false);433434// Expand to include preceeding associated comments. We ignore any comments435// that are not preceeding the decl, since we've already skipped trailing436// comments with getEntityEndLoc.437if (const RawComment *Comment =438Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))439// Only include a preceding comment if:440// * it is *not* separate from the declaration (not including any newline441// that immediately follows the comment),442// * the decl *is* separate from any following entity (so, there are no443// other entities the comment could refer to), and444// * it is not a IfThisThenThat lint check.445if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),446Range.getBegin()) &&447!atOrBeforeSeparation(448SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),449LangOpts) &&450atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {451const StringRef CommentText = Comment->getRawText(SM);452if (!CommentText.contains("LINT.IfChange") &&453!CommentText.contains("LINT.ThenChange"))454Range.setBegin(Comment->getBeginLoc());455}456// Add leading attributes.457for (auto *Attr : Decl.attrs()) {458if (Attr->getLocation().isInvalid() ||459!SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))460continue;461Range.setBegin(Attr->getLocation());462463// Extend to the left '[[' or '__attribute((' if we saw the attribute,464// unless it is not a valid location.465bool Invalid;466StringRef Source =467SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);468if (Invalid)469continue;470llvm::StringRef BeforeAttr =471Source.substr(0, SM.getFileOffset(Range.getBegin()));472llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();473474for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {475// Handle whitespace between attribute prefix and attribute value.476if (BeforeAttrStripped.ends_with(Prefix)) {477// Move start to start position of prefix, which is478// length(BeforeAttr) - 
length(BeforeAttrStripped) + length(Prefix)479// positions to the left.480Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(481-BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));482break;483// If we didn't see '[[' or '__attribute' it's probably coming from a484// macro expansion which is already handled by makeFileCharRange(),485// below.486}487}488}489490// Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,491// Range.getBegin() may be inside an expansion.492return Lexer::makeFileCharRange(Range, SM, LangOpts);493}494495496