Path: blob/main/contrib/llvm-project/clang/lib/Analysis/CloneDetection.cpp
35233 views
//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7///8/// This file implements classes for searching and analyzing source code clones.9///10//===----------------------------------------------------------------------===//1112#include "clang/Analysis/CloneDetection.h"13#include "clang/AST/Attr.h"14#include "clang/AST/DataCollection.h"15#include "clang/AST/DeclTemplate.h"16#include "clang/Basic/SourceManager.h"17#include "llvm/Support/MD5.h"18#include "llvm/Support/Path.h"1920using namespace clang;2122StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D,23unsigned StartIndex, unsigned EndIndex)24: S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) {25assert(Stmt && "Stmt must not be a nullptr");26assert(StartIndex < EndIndex && "Given array should not be empty");27assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");28}2930StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D)31: S(Stmt), D(D), StartIndex(0), EndIndex(0) {}3233StmtSequence::StmtSequence()34: S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {}3536bool StmtSequence::contains(const StmtSequence &Other) const {37// If both sequences reside in different declarations, they can never contain38// each other.39if (D != Other.D)40return false;4142const SourceManager &SM = getASTContext().getSourceManager();4344// Otherwise check if the start and end locations of the current sequence45// surround the other sequence.46bool StartIsInBounds =47SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) ||48getBeginLoc() == Other.getBeginLoc();49if (!StartIsInBounds)50return false;5152bool EndIsInBounds =53SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||54Other.getEndLoc() == getEndLoc();55return EndIsInBounds;56}5758StmtSequence::iterator StmtSequence::begin() const {59if (!holdsSequence()) {60return &S;61}62auto CS = cast<CompoundStmt>(S);63return CS->body_begin() + StartIndex;64}6566StmtSequence::iterator StmtSequence::end() const {67if (!holdsSequence()) {68return reinterpret_cast<StmtSequence::iterator>(&S) + 1;69}70auto CS = cast<CompoundStmt>(S);71return CS->body_begin() + EndIndex;72}7374ASTContext &StmtSequence::getASTContext() const {75assert(D);76return D->getASTContext();77}7879SourceLocation StmtSequence::getBeginLoc() const {80return front()->getBeginLoc();81}8283SourceLocation StmtSequence::getEndLoc() const { return back()->getEndLoc(); }8485SourceRange StmtSequence::getSourceRange() const {86return SourceRange(getBeginLoc(), getEndLoc());87}8889void CloneDetector::analyzeCodeBody(const Decl *D) {90assert(D);91assert(D->hasBody());9293Sequences.push_back(StmtSequence(D->getBody(), D));94}9596/// Returns true if and only if \p Stmt contains at least one other97/// sequence in the \p Group.98static bool containsAnyInGroup(StmtSequence &Seq,99CloneDetector::CloneGroup &Group) {100for (StmtSequence &GroupSeq : Group) {101if (Seq.contains(GroupSeq))102return true;103}104return false;105}106107/// Returns true if and only if all sequences in \p OtherGroup are108/// contained by a sequence in \p Group.109static bool containsGroup(CloneDetector::CloneGroup &Group,110CloneDetector::CloneGroup &OtherGroup) {111// We have less sequences in the current group than we have in the other,112// so we will never fulfill the requirement for returning true. This is only113// possible because we know that a sequence in Group can contain at most114// one sequence in OtherGroup.115if (Group.size() < OtherGroup.size())116return false;117118for (StmtSequence &Stmt : Group) {119if (!containsAnyInGroup(Stmt, OtherGroup))120return false;121}122return true;123}124125void OnlyLargestCloneConstraint::constrain(126std::vector<CloneDetector::CloneGroup> &Result) {127std::vector<unsigned> IndexesToRemove;128129// Compare every group in the result with the rest. If one groups contains130// another group, we only need to return the bigger group.131// Note: This doesn't scale well, so if possible avoid calling any heavy132// function from this loop to minimize the performance impact.133for (unsigned i = 0; i < Result.size(); ++i) {134for (unsigned j = 0; j < Result.size(); ++j) {135// Don't compare a group with itself.136if (i == j)137continue;138139if (containsGroup(Result[j], Result[i])) {140IndexesToRemove.push_back(i);141break;142}143}144}145146// Erasing a list of indexes from the vector should be done with decreasing147// indexes. As IndexesToRemove is constructed with increasing values, we just148// reverse iterate over it to get the desired order.149for (unsigned I : llvm::reverse(IndexesToRemove))150Result.erase(Result.begin() + I);151}152153bool FilenamePatternConstraint::isAutoGenerated(154const CloneDetector::CloneGroup &Group) {155if (IgnoredFilesPattern.empty() || Group.empty() ||156!IgnoredFilesRegex->isValid())157return false;158159for (const StmtSequence &S : Group) {160const SourceManager &SM = S.getASTContext().getSourceManager();161StringRef Filename = llvm::sys::path::filename(162SM.getFilename(S.getContainingDecl()->getLocation()));163if (IgnoredFilesRegex->match(Filename))164return true;165}166167return false;168}169170/// This class defines what a type II code clone is: If it collects for two171/// statements the same data, then those two statements are considered to be172/// clones of each other.173///174/// All collected data is forwarded to the given data consumer of the type T.175/// The data consumer class needs to provide a member method with the signature:176/// update(StringRef Str)177namespace {178template <class T>179class CloneTypeIIStmtDataCollector180: public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {181ASTContext &Context;182/// The data sink to which all data is forwarded.183T &DataConsumer;184185template <class Ty> void addData(const Ty &Data) {186data_collection::addDataToConsumer(DataConsumer, Data);187}188189public:190CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,191T &DataConsumer)192: Context(Context), DataConsumer(DataConsumer) {193this->Visit(S);194}195196// Define a visit method for each class to collect data and subsequently visit197// all parent classes. This uses a template so that custom visit methods by us198// take precedence.199#define DEF_ADD_DATA(CLASS, CODE) \200template <class = void> void Visit##CLASS(const CLASS *S) { \201CODE; \202ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \203}204205#include "clang/AST/StmtDataCollectors.inc"206207// Type II clones ignore variable names and literals, so let's skip them.208#define SKIP(CLASS) \209void Visit##CLASS(const CLASS *S) { \210ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \211}212SKIP(DeclRefExpr)213SKIP(MemberExpr)214SKIP(IntegerLiteral)215SKIP(FloatingLiteral)216SKIP(StringLiteral)217SKIP(CXXBoolLiteralExpr)218SKIP(CharacterLiteral)219#undef SKIP220};221} // end anonymous namespace222223static size_t createHash(llvm::MD5 &Hash) {224size_t HashCode;225226// Create the final hash code for the current Stmt.227llvm::MD5::MD5Result HashResult;228Hash.final(HashResult);229230// Copy as much as possible of the generated hash code to the Stmt's hash231// code.232std::memcpy(&HashCode, &HashResult,233std::min(sizeof(HashCode), sizeof(HashResult)));234235return HashCode;236}237238/// Generates and saves a hash code for the given Stmt.239/// \param S The given Stmt.240/// \param D The Decl containing S.241/// \param StmtsByHash Output parameter that will contain the hash codes for242/// each StmtSequence in the given Stmt.243/// \return The hash code of the given Stmt.244///245/// If the given Stmt is a CompoundStmt, this method will also generate246/// hashes for all possible StmtSequences in the children of this Stmt.247static size_t248saveHash(const Stmt *S, const Decl *D,249std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) {250llvm::MD5 Hash;251ASTContext &Context = D->getASTContext();252253CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);254255auto CS = dyn_cast<CompoundStmt>(S);256SmallVector<size_t, 8> ChildHashes;257258for (const Stmt *Child : S->children()) {259if (Child == nullptr) {260ChildHashes.push_back(0);261continue;262}263size_t ChildHash = saveHash(Child, D, StmtsByHash);264Hash.update(265StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));266ChildHashes.push_back(ChildHash);267}268269if (CS) {270// If we're in a CompoundStmt, we hash all possible combinations of child271// statements to find clones in those subsequences.272// We first go through every possible starting position of a subsequence.273for (unsigned Pos = 0; Pos < CS->size(); ++Pos) {274// Then we try all possible lengths this subsequence could have and275// reuse the same hash object to make sure we only hash every child276// hash exactly once.277llvm::MD5 Hash;278for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) {279// Grab the current child hash and put it into our hash. We do280// -1 on the index because we start counting the length at 1.281size_t ChildHash = ChildHashes[Pos + Length - 1];282Hash.update(283StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));284// If we have at least two elements in our subsequence, we can start285// saving it.286if (Length > 1) {287llvm::MD5 SubHash = Hash;288StmtsByHash.push_back(std::make_pair(289createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length)));290}291}292}293}294295size_t HashCode = createHash(Hash);296StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D)));297return HashCode;298}299300namespace {301/// Wrapper around FoldingSetNodeID that it can be used as the template302/// argument of the StmtDataCollector.303class FoldingSetNodeIDWrapper {304305llvm::FoldingSetNodeID &FS;306307public:308FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}309310void update(StringRef Str) { FS.AddString(Str); }311};312} // end anonymous namespace313314/// Writes the relevant data from all statements and child statements315/// in the given StmtSequence into the given FoldingSetNodeID.316static void CollectStmtSequenceData(const StmtSequence &Sequence,317FoldingSetNodeIDWrapper &OutputData) {318for (const Stmt *S : Sequence) {319CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(320S, Sequence.getASTContext(), OutputData);321322for (const Stmt *Child : S->children()) {323if (!Child)324continue;325326CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()),327OutputData);328}329}330}331332/// Returns true if both sequences are clones of each other.333static bool areSequencesClones(const StmtSequence &LHS,334const StmtSequence &RHS) {335// We collect the data from all statements in the sequence as we did before336// when generating a hash value for each sequence. But this time we don't337// hash the collected data and compare the whole data set instead. This338// prevents any false-positives due to hash code collisions.339llvm::FoldingSetNodeID DataLHS, DataRHS;340FoldingSetNodeIDWrapper LHSWrapper(DataLHS);341FoldingSetNodeIDWrapper RHSWrapper(DataRHS);342343CollectStmtSequenceData(LHS, LHSWrapper);344CollectStmtSequenceData(RHS, RHSWrapper);345346return DataLHS == DataRHS;347}348349void RecursiveCloneTypeIIHashConstraint::constrain(350std::vector<CloneDetector::CloneGroup> &Sequences) {351// FIXME: Maybe we can do this in-place and don't need this additional vector.352std::vector<CloneDetector::CloneGroup> Result;353354for (CloneDetector::CloneGroup &Group : Sequences) {355// We assume in the following code that the Group is non-empty, so we356// skip all empty groups.357if (Group.empty())358continue;359360std::vector<std::pair<size_t, StmtSequence>> StmtsByHash;361362// Generate hash codes for all children of S and save them in StmtsByHash.363for (const StmtSequence &S : Group) {364saveHash(S.front(), S.getContainingDecl(), StmtsByHash);365}366367// Sort hash_codes in StmtsByHash.368llvm::stable_sort(StmtsByHash, llvm::less_first());369370// Check for each StmtSequence if its successor has the same hash value.371// We don't check the last StmtSequence as it has no successor.372// Note: The 'size - 1 ' in the condition is safe because we check for an373// empty Group vector at the beginning of this function.374for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) {375const auto Current = StmtsByHash[i];376377// It's likely that we just found a sequence of StmtSequences that378// represent a CloneGroup, so we create a new group and start checking and379// adding the StmtSequences in this sequence.380CloneDetector::CloneGroup NewGroup;381382size_t PrototypeHash = Current.first;383384for (; i < StmtsByHash.size(); ++i) {385// A different hash value means we have reached the end of the sequence.386if (PrototypeHash != StmtsByHash[i].first) {387// The current sequence could be the start of a new CloneGroup. So we388// decrement i so that we visit it again in the outer loop.389// Note: i can never be 0 at this point because we are just comparing390// the hash of the Current StmtSequence with itself in the 'if' above.391assert(i != 0);392--i;393break;394}395// Same hash value means we should add the StmtSequence to the current396// group.397NewGroup.push_back(StmtsByHash[i].second);398}399400// We created a new clone group with matching hash codes and move it to401// the result vector.402Result.push_back(NewGroup);403}404}405// Sequences is the output parameter, so we copy our result into it.406Sequences = Result;407}408409void RecursiveCloneTypeIIVerifyConstraint::constrain(410std::vector<CloneDetector::CloneGroup> &Sequences) {411CloneConstraint::splitCloneGroups(412Sequences, [](const StmtSequence &A, const StmtSequence &B) {413return areSequencesClones(A, B);414});415}416417size_t MinComplexityConstraint::calculateStmtComplexity(418const StmtSequence &Seq, std::size_t Limit,419const std::string &ParentMacroStack) {420if (Seq.empty())421return 0;422423size_t Complexity = 1;424425ASTContext &Context = Seq.getASTContext();426427// Look up what macros expanded into the current statement.428std::string MacroStack =429data_collection::getMacroStack(Seq.getBeginLoc(), Context);430431// First, check if ParentMacroStack is not empty which means we are currently432// dealing with a parent statement which was expanded from a macro.433// If this parent statement was expanded from the same macros as this434// statement, we reduce the initial complexity of this statement to zero.435// This causes that a group of statements that were generated by a single436// macro expansion will only increase the total complexity by one.437// Note: This is not the final complexity of this statement as we still438// add the complexity of the child statements to the complexity value.439if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) {440Complexity = 0;441}442443// Iterate over the Stmts in the StmtSequence and add their complexity values444// to the current complexity value.445if (Seq.holdsSequence()) {446for (const Stmt *S : Seq) {447Complexity += calculateStmtComplexity(448StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);449if (Complexity >= Limit)450return Limit;451}452} else {453for (const Stmt *S : Seq.front()->children()) {454Complexity += calculateStmtComplexity(455StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);456if (Complexity >= Limit)457return Limit;458}459}460return Complexity;461}462463void MatchingVariablePatternConstraint::constrain(464std::vector<CloneDetector::CloneGroup> &CloneGroups) {465CloneConstraint::splitCloneGroups(466CloneGroups, [](const StmtSequence &A, const StmtSequence &B) {467VariablePattern PatternA(A);468VariablePattern PatternB(B);469return PatternA.countPatternDifferences(PatternB) == 0;470});471}472473void CloneConstraint::splitCloneGroups(474std::vector<CloneDetector::CloneGroup> &CloneGroups,475llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>476Compare) {477std::vector<CloneDetector::CloneGroup> Result;478for (auto &HashGroup : CloneGroups) {479// Contains all indexes in HashGroup that were already added to a480// CloneGroup.481std::vector<char> Indexes;482Indexes.resize(HashGroup.size());483484for (unsigned i = 0; i < HashGroup.size(); ++i) {485// Skip indexes that are already part of a CloneGroup.486if (Indexes[i])487continue;488489// Pick the first unhandled StmtSequence and consider it as the490// beginning491// of a new CloneGroup for now.492// We don't add i to Indexes because we never iterate back.493StmtSequence Prototype = HashGroup[i];494CloneDetector::CloneGroup PotentialGroup = {Prototype};495++Indexes[i];496497// Check all following StmtSequences for clones.498for (unsigned j = i + 1; j < HashGroup.size(); ++j) {499// Skip indexes that are already part of a CloneGroup.500if (Indexes[j])501continue;502503// If a following StmtSequence belongs to our CloneGroup, we add it.504const StmtSequence &Candidate = HashGroup[j];505506if (!Compare(Prototype, Candidate))507continue;508509PotentialGroup.push_back(Candidate);510// Make sure we never visit this StmtSequence again.511++Indexes[j];512}513514// Otherwise, add it to the result and continue searching for more515// groups.516Result.push_back(PotentialGroup);517}518519assert(llvm::all_of(Indexes, [](char c) { return c == 1; }));520}521CloneGroups = Result;522}523524void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,525const Stmt *Mention) {526// First check if we already reference this variable527for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {528if (Variables[KindIndex] == VarDecl) {529// If yes, add a new occurrence that points to the existing entry in530// the Variables vector.531Occurences.emplace_back(KindIndex, Mention);532return;533}534}535// If this variable wasn't already referenced, add it to the list of536// referenced variables and add a occurrence that points to this new entry.537Occurences.emplace_back(Variables.size(), Mention);538Variables.push_back(VarDecl);539}540541void VariablePattern::addVariables(const Stmt *S) {542// Sometimes we get a nullptr (such as from IfStmts which often have nullptr543// children). We skip such statements as they don't reference any544// variables.545if (!S)546return;547548// Check if S is a reference to a variable. If yes, add it to the pattern.549if (auto D = dyn_cast<DeclRefExpr>(S)) {550if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))551addVariableOccurence(VD, D);552}553554// Recursively check all children of the given statement.555for (const Stmt *Child : S->children()) {556addVariables(Child);557}558}559560unsigned VariablePattern::countPatternDifferences(561const VariablePattern &Other,562VariablePattern::SuspiciousClonePair *FirstMismatch) {563unsigned NumberOfDifferences = 0;564565assert(Other.Occurences.size() == Occurences.size());566for (unsigned i = 0; i < Occurences.size(); ++i) {567auto ThisOccurence = Occurences[i];568auto OtherOccurence = Other.Occurences[i];569if (ThisOccurence.KindID == OtherOccurence.KindID)570continue;571572++NumberOfDifferences;573574// If FirstMismatch is not a nullptr, we need to store information about575// the first difference between the two patterns.576if (FirstMismatch == nullptr)577continue;578579// Only proceed if we just found the first difference as we only store580// information about the first difference.581if (NumberOfDifferences != 1)582continue;583584const VarDecl *FirstSuggestion = nullptr;585// If there is a variable available in the list of referenced variables586// which wouldn't break the pattern if it is used in place of the587// current variable, we provide this variable as the suggested fix.588if (OtherOccurence.KindID < Variables.size())589FirstSuggestion = Variables[OtherOccurence.KindID];590591// Store information about the first clone.592FirstMismatch->FirstCloneInfo =593VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(594Variables[ThisOccurence.KindID], ThisOccurence.Mention,595FirstSuggestion);596597// Same as above but with the other clone. We do this for both clones as598// we don't know which clone is the one containing the unintended599// pattern error.600const VarDecl *SecondSuggestion = nullptr;601if (ThisOccurence.KindID < Other.Variables.size())602SecondSuggestion = Other.Variables[ThisOccurence.KindID];603604// Store information about the second clone.605FirstMismatch->SecondCloneInfo =606VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(607Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention,608SecondSuggestion);609610// SuspiciousClonePair guarantees that the first clone always has a611// suggested variable associated with it. As we know that one of the two612// clones in the pair always has suggestion, we swap the two clones613// in case the first clone has no suggested variable which means that614// the second clone has a suggested variable and should be first.615if (!FirstMismatch->FirstCloneInfo.Suggestion)616std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo);617618// This ensures that we always have at least one suggestion in a pair.619assert(FirstMismatch->FirstCloneInfo.Suggestion);620}621622return NumberOfDifferences;623}624625626