Path: blob/main/contrib/llvm-project/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp
35266 views
//===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//78#include "clang/Tooling/Inclusions/HeaderIncludes.h"9#include "clang/Basic/FileManager.h"10#include "clang/Basic/SourceManager.h"11#include "clang/Lex/Lexer.h"12#include "llvm/Support/FormatVariadic.h"13#include "llvm/Support/Path.h"14#include <optional>1516namespace clang {17namespace tooling {18namespace {1920LangOptions createLangOpts() {21LangOptions LangOpts;22LangOpts.CPlusPlus = 1;23LangOpts.CPlusPlus11 = 1;24LangOpts.CPlusPlus14 = 1;25LangOpts.LineComment = 1;26LangOpts.CXXOperatorNames = 1;27LangOpts.Bool = 1;28LangOpts.ObjC = 1;29LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally.30LangOpts.DeclSpecKeyword = 1; // To get __declspec.31LangOpts.WChar = 1; // To get wchar_t32return LangOpts;33}3435// Returns the offset after skipping a sequence of tokens, matched by \p36// GetOffsetAfterSequence, from the start of the code.37// \p GetOffsetAfterSequence should be a function that matches a sequence of38// tokens and returns an offset after the sequence.39unsigned getOffsetAfterTokenSequence(40StringRef FileName, StringRef Code, const IncludeStyle &Style,41llvm::function_ref<unsigned(const SourceManager &, Lexer &, Token &)>42GetOffsetAfterSequence) {43SourceManagerForFile VirtualSM(FileName, Code);44SourceManager &SM = VirtualSM.get();45LangOptions LangOpts = createLangOpts();46Lexer Lex(SM.getMainFileID(), SM.getBufferOrFake(SM.getMainFileID()), SM,47LangOpts);48Token Tok;49// Get the first token.50Lex.LexFromRawLexer(Tok);51return GetOffsetAfterSequence(SM, Lex, Tok);52}5354// Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,55// \p Tok will be the token after this directive; otherwise, it can be any token56// after the given \p Tok (including \p Tok). If \p RawIDName is provided, the57// (second) raw_identifier name is checked.58bool checkAndConsumeDirectiveWithName(59Lexer &Lex, StringRef Name, Token &Tok,60std::optional<StringRef> RawIDName = std::nullopt) {61bool Matched = Tok.is(tok::hash) && !Lex.LexFromRawLexer(Tok) &&62Tok.is(tok::raw_identifier) &&63Tok.getRawIdentifier() == Name && !Lex.LexFromRawLexer(Tok) &&64Tok.is(tok::raw_identifier) &&65(!RawIDName || Tok.getRawIdentifier() == *RawIDName);66if (Matched)67Lex.LexFromRawLexer(Tok);68return Matched;69}7071void skipComments(Lexer &Lex, Token &Tok) {72while (Tok.is(tok::comment))73if (Lex.LexFromRawLexer(Tok))74return;75}7677// Returns the offset after header guard directives and any comments78// before/after header guards (e.g. #ifndef/#define pair, #pragma once). If no79// header guard is present in the code, this will return the offset after80// skipping all comments from the start of the code.81unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName,82StringRef Code,83const IncludeStyle &Style) {84// \p Consume returns location after header guard or 0 if no header guard is85// found.86auto ConsumeHeaderGuardAndComment =87[&](std::function<unsigned(const SourceManager &SM, Lexer &Lex,88Token Tok)>89Consume) {90return getOffsetAfterTokenSequence(91FileName, Code, Style,92[&Consume](const SourceManager &SM, Lexer &Lex, Token Tok) {93skipComments(Lex, Tok);94unsigned InitialOffset = SM.getFileOffset(Tok.getLocation());95return std::max(InitialOffset, Consume(SM, Lex, Tok));96});97};98return std::max(99// #ifndef/#define100ConsumeHeaderGuardAndComment(101[](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {102if (checkAndConsumeDirectiveWithName(Lex, "ifndef", Tok)) {103skipComments(Lex, Tok);104if (checkAndConsumeDirectiveWithName(Lex, "define", Tok) &&105Tok.isAtStartOfLine())106return SM.getFileOffset(Tok.getLocation());107}108return 0;109}),110// #pragma once111ConsumeHeaderGuardAndComment(112[](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {113if (checkAndConsumeDirectiveWithName(Lex, "pragma", Tok,114StringRef("once")))115return SM.getFileOffset(Tok.getLocation());116return 0;117}));118}119120// Check if a sequence of tokens is like121// "#include ("header.h" | <header.h>)".122// If it is, \p Tok will be the token after this directive; otherwise, it can be123// any token after the given \p Tok (including \p Tok).124bool checkAndConsumeInclusiveDirective(Lexer &Lex, Token &Tok) {125auto Matched = [&]() {126Lex.LexFromRawLexer(Tok);127return true;128};129if (Tok.is(tok::hash) && !Lex.LexFromRawLexer(Tok) &&130Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() == "include") {131if (Lex.LexFromRawLexer(Tok))132return false;133if (Tok.is(tok::string_literal))134return Matched();135if (Tok.is(tok::less)) {136while (!Lex.LexFromRawLexer(Tok) && Tok.isNot(tok::greater)) {137}138if (Tok.is(tok::greater))139return Matched();140}141}142return false;143}144145// Returns the offset of the last #include directive after which a new146// #include can be inserted. This ignores #include's after the #include block(s)147// in the beginning of a file to avoid inserting headers into code sections148// where new #include's should not be added by default.149// These code sections include:150// - raw string literals (containing #include).151// - #if blocks.152// - Special #include's among declarations (e.g. functions).153//154// If no #include after which a new #include can be inserted, this returns the155// offset after skipping all comments from the start of the code.156// Inserting after an #include is not allowed if it comes after code that is not157// #include (e.g. pre-processing directive that is not #include, declarations).158unsigned getMaxHeaderInsertionOffset(StringRef FileName, StringRef Code,159const IncludeStyle &Style) {160return getOffsetAfterTokenSequence(161FileName, Code, Style,162[](const SourceManager &SM, Lexer &Lex, Token Tok) {163skipComments(Lex, Tok);164unsigned MaxOffset = SM.getFileOffset(Tok.getLocation());165while (checkAndConsumeInclusiveDirective(Lex, Tok))166MaxOffset = SM.getFileOffset(Tok.getLocation());167return MaxOffset;168});169}170171inline StringRef trimInclude(StringRef IncludeName) {172return IncludeName.trim("\"<>");173}174175const char IncludeRegexPattern[] =176R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";177178// The filename of Path excluding extension.179// Used to match implementation with headers, this differs from sys::path::stem:180// - in names with multiple dots (foo.cu.cc) it terminates at the *first*181// - an empty stem is never returned: /foo/.bar.x => .bar182// - we don't bother to handle . and .. specially183StringRef matchingStem(llvm::StringRef Path) {184StringRef Name = llvm::sys::path::filename(Path);185return Name.substr(0, Name.find('.', 1));186}187188} // anonymous namespace189190IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle &Style,191StringRef FileName)192: Style(Style), FileName(FileName) {193for (const auto &Category : Style.IncludeCategories) {194CategoryRegexs.emplace_back(Category.Regex, Category.RegexIsCaseSensitive195? llvm::Regex::NoFlags196: llvm::Regex::IgnoreCase);197}198IsMainFile = FileName.ends_with(".c") || FileName.ends_with(".cc") ||199FileName.ends_with(".cpp") || FileName.ends_with(".c++") ||200FileName.ends_with(".cxx") || FileName.ends_with(".m") ||201FileName.ends_with(".mm");202if (!Style.IncludeIsMainSourceRegex.empty()) {203llvm::Regex MainFileRegex(Style.IncludeIsMainSourceRegex);204IsMainFile |= MainFileRegex.match(FileName);205}206}207208int IncludeCategoryManager::getIncludePriority(StringRef IncludeName,209bool CheckMainHeader) const {210int Ret = INT_MAX;211for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)212if (CategoryRegexs[i].match(IncludeName)) {213Ret = Style.IncludeCategories[i].Priority;214break;215}216if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))217Ret = 0;218return Ret;219}220221int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName,222bool CheckMainHeader) const {223int Ret = INT_MAX;224for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)225if (CategoryRegexs[i].match(IncludeName)) {226Ret = Style.IncludeCategories[i].SortPriority;227if (Ret == 0)228Ret = Style.IncludeCategories[i].Priority;229break;230}231if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))232Ret = 0;233return Ret;234}235bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const {236switch (Style.MainIncludeChar) {237case IncludeStyle::MICD_Quote:238if (!IncludeName.starts_with("\""))239return false;240break;241case IncludeStyle::MICD_AngleBracket:242if (!IncludeName.starts_with("<"))243return false;244break;245case IncludeStyle::MICD_Any:246break;247}248249IncludeName =250IncludeName.drop_front(1).drop_back(1); // remove the surrounding "" or <>251// Not matchingStem: implementation files may have compound extensions but252// headers may not.253StringRef HeaderStem = llvm::sys::path::stem(IncludeName);254StringRef FileStem = llvm::sys::path::stem(FileName); // foo.cu for foo.cu.cc255StringRef MatchingFileStem = matchingStem(FileName); // foo for foo.cu.cc256// main-header examples:257// 1) foo.h => foo.cc258// 2) foo.h => foo.cu.cc259// 3) foo.proto.h => foo.proto.cc260//261// non-main-header examples:262// 1) foo.h => bar.cc263// 2) foo.proto.h => foo.cc264StringRef Matching;265if (MatchingFileStem.starts_with_insensitive(HeaderStem))266Matching = MatchingFileStem; // example 1), 2)267else if (FileStem.equals_insensitive(HeaderStem))268Matching = FileStem; // example 3)269if (!Matching.empty()) {270llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex,271llvm::Regex::IgnoreCase);272if (MainIncludeRegex.match(Matching))273return true;274}275return false;276}277278const llvm::Regex HeaderIncludes::IncludeRegex(IncludeRegexPattern);279280HeaderIncludes::HeaderIncludes(StringRef FileName, StringRef Code,281const IncludeStyle &Style)282: FileName(FileName), Code(Code), FirstIncludeOffset(-1),283MinInsertOffset(284getOffsetAfterHeaderGuardsAndComments(FileName, Code, Style)),285MaxInsertOffset(MinInsertOffset +286getMaxHeaderInsertionOffset(287FileName, Code.drop_front(MinInsertOffset), Style)),288MainIncludeFound(false),289Categories(Style, FileName) {290// Add 0 for main header and INT_MAX for headers that are not in any291// category.292Priorities = {0, INT_MAX};293for (const auto &Category : Style.IncludeCategories)294Priorities.insert(Category.Priority);295SmallVector<StringRef, 32> Lines;296Code.drop_front(MinInsertOffset).split(Lines, "\n");297298unsigned Offset = MinInsertOffset;299unsigned NextLineOffset;300SmallVector<StringRef, 4> Matches;301for (auto Line : Lines) {302NextLineOffset = std::min(Code.size(), Offset + Line.size() + 1);303if (IncludeRegex.match(Line, &Matches)) {304// If this is the last line without trailing newline, we need to make305// sure we don't delete across the file boundary.306addExistingInclude(307Include(Matches[2],308tooling::Range(309Offset, std::min(Line.size() + 1, Code.size() - Offset)),310Matches[1] == "import" ? tooling::IncludeDirective::Import311: tooling::IncludeDirective::Include),312NextLineOffset);313}314Offset = NextLineOffset;315}316317// Populate CategoryEndOfssets:318// - Ensure that CategoryEndOffset[Highest] is always populated.319// - If CategoryEndOffset[Priority] isn't set, use the next higher value320// that is set, up to CategoryEndOffset[Highest].321auto Highest = Priorities.begin();322if (CategoryEndOffsets.find(*Highest) == CategoryEndOffsets.end()) {323if (FirstIncludeOffset >= 0)324CategoryEndOffsets[*Highest] = FirstIncludeOffset;325else326CategoryEndOffsets[*Highest] = MinInsertOffset;327}328// By this point, CategoryEndOffset[Highest] is always set appropriately:329// - to an appropriate location before/after existing #includes, or330// - to right after the header guard, or331// - to the beginning of the file.332for (auto I = ++Priorities.begin(), E = Priorities.end(); I != E; ++I)333if (CategoryEndOffsets.find(*I) == CategoryEndOffsets.end())334CategoryEndOffsets[*I] = CategoryEndOffsets[*std::prev(I)];335}336337// \p Offset: the start of the line following this include directive.338void HeaderIncludes::addExistingInclude(Include IncludeToAdd,339unsigned NextLineOffset) {340auto Iter =341ExistingIncludes.try_emplace(trimInclude(IncludeToAdd.Name)).first;342Iter->second.push_back(std::move(IncludeToAdd));343auto &CurInclude = Iter->second.back();344// The header name with quotes or angle brackets.345// Only record the offset of current #include if we can insert after it.346if (CurInclude.R.getOffset() <= MaxInsertOffset) {347int Priority = Categories.getIncludePriority(348CurInclude.Name, /*CheckMainHeader=*/!MainIncludeFound);349if (Priority == 0)350MainIncludeFound = true;351CategoryEndOffsets[Priority] = NextLineOffset;352IncludesByPriority[Priority].push_back(&CurInclude);353if (FirstIncludeOffset < 0)354FirstIncludeOffset = CurInclude.R.getOffset();355}356}357358std::optional<tooling::Replacement>359HeaderIncludes::insert(llvm::StringRef IncludeName, bool IsAngled,360IncludeDirective Directive) const {361assert(IncludeName == trimInclude(IncludeName));362// If a <header> ("header") already exists in code, "header" (<header>) with363// different quotation and/or directive will still be inserted.364// FIXME: figure out if this is the best behavior.365auto It = ExistingIncludes.find(IncludeName);366if (It != ExistingIncludes.end()) {367for (const auto &Inc : It->second)368if (Inc.Directive == Directive &&369((IsAngled && StringRef(Inc.Name).starts_with("<")) ||370(!IsAngled && StringRef(Inc.Name).starts_with("\""))))371return std::nullopt;372}373std::string Quoted =374std::string(llvm::formatv(IsAngled ? "<{0}>" : "\"{0}\"", IncludeName));375StringRef QuotedName = Quoted;376int Priority = Categories.getIncludePriority(377QuotedName, /*CheckMainHeader=*/!MainIncludeFound);378auto CatOffset = CategoryEndOffsets.find(Priority);379assert(CatOffset != CategoryEndOffsets.end());380unsigned InsertOffset = CatOffset->second; // Fall back offset381auto Iter = IncludesByPriority.find(Priority);382if (Iter != IncludesByPriority.end()) {383for (const auto *Inc : Iter->second) {384if (QuotedName < Inc->Name) {385InsertOffset = Inc->R.getOffset();386break;387}388}389}390assert(InsertOffset <= Code.size());391llvm::StringRef DirectiveSpelling =392Directive == IncludeDirective::Include ? "include" : "import";393std::string NewInclude =394llvm::formatv("#{0} {1}\n", DirectiveSpelling, QuotedName);395// When inserting headers at end of the code, also append '\n' to the code396// if it does not end with '\n'.397// FIXME: when inserting multiple #includes at the end of code, only one398// newline should be added.399if (InsertOffset == Code.size() && (!Code.empty() && Code.back() != '\n'))400NewInclude = "\n" + NewInclude;401return tooling::Replacement(FileName, InsertOffset, 0, NewInclude);402}403404tooling::Replacements HeaderIncludes::remove(llvm::StringRef IncludeName,405bool IsAngled) const {406assert(IncludeName == trimInclude(IncludeName));407tooling::Replacements Result;408auto Iter = ExistingIncludes.find(IncludeName);409if (Iter == ExistingIncludes.end())410return Result;411for (const auto &Inc : Iter->second) {412if ((IsAngled && StringRef(Inc.Name).starts_with("\"")) ||413(!IsAngled && StringRef(Inc.Name).starts_with("<")))414continue;415llvm::Error Err = Result.add(tooling::Replacement(416FileName, Inc.R.getOffset(), Inc.R.getLength(), ""));417if (Err) {418auto ErrMsg = "Unexpected conflicts in #include deletions: " +419llvm::toString(std::move(Err));420llvm_unreachable(ErrMsg.c_str());421}422}423return Result;424}425426} // namespace tooling427} // namespace clang428429430