Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Analysis/CloneDetection.cpp
35233 views
1
//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// This file implements classes for searching and analyzing source code clones.
10
///
11
//===----------------------------------------------------------------------===//
12
13
#include "clang/Analysis/CloneDetection.h"
14
#include "clang/AST/Attr.h"
15
#include "clang/AST/DataCollection.h"
16
#include "clang/AST/DeclTemplate.h"
17
#include "clang/Basic/SourceManager.h"
18
#include "llvm/Support/MD5.h"
19
#include "llvm/Support/Path.h"
20
21
using namespace clang;
22
23
/// Creates a sequence covering the children [StartIndex, EndIndex) of the
/// CompoundStmt \p Stmt, which is located inside the declaration \p D.
///
/// The range must be non-empty and must not reach past the last child of
/// \p Stmt; both conditions are checked by assertions.
StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D,
                           unsigned StartIndex, unsigned EndIndex)
    : S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) {
  assert(Stmt && "Stmt must not be a nullptr");
  assert(StartIndex < EndIndex && "Given array should not be empty");
  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
}
30
31
/// Creates a sequence that wraps the single statement \p Stmt located inside
/// the declaration \p D. Both indexes are set to zero in this mode.
StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D)
    : S(Stmt), D(D), StartIndex(0), EndIndex(0) {}
33
34
/// Creates a sequence that refers to no statement and no declaration.
///
/// Delegates to the single-statement constructor, which stores the two null
/// pointers and zeroes both indexes.
StmtSequence::StmtSequence()
    : StmtSequence(static_cast<const Stmt *>(nullptr),
                   static_cast<const Decl *>(nullptr)) {}
36
37
bool StmtSequence::contains(const StmtSequence &Other) const {
38
// If both sequences reside in different declarations, they can never contain
39
// each other.
40
if (D != Other.D)
41
return false;
42
43
const SourceManager &SM = getASTContext().getSourceManager();
44
45
// Otherwise check if the start and end locations of the current sequence
46
// surround the other sequence.
47
bool StartIsInBounds =
48
SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) ||
49
getBeginLoc() == Other.getBeginLoc();
50
if (!StartIsInBounds)
51
return false;
52
53
bool EndIsInBounds =
54
SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
55
Other.getEndLoc() == getEndLoc();
56
return EndIsInBounds;
57
}
58
59
/// Returns an iterator to the first statement of this sequence.
StmtSequence::iterator StmtSequence::begin() const {
  // A real sub-range of a CompoundStmt: offset into its body.
  if (holdsSequence())
    return cast<CompoundStmt>(S)->body_begin() + StartIndex;

  // A single statement: treat the member pointer as a one-element array.
  return &S;
}
66
67
/// Returns the past-the-end iterator matching begin().
StmtSequence::iterator StmtSequence::end() const {
  // A real sub-range of a CompoundStmt: offset into its body.
  if (holdsSequence())
    return cast<CompoundStmt>(S)->body_begin() + EndIndex;

  // A single statement: one past the member pointer that begin() returns.
  return reinterpret_cast<StmtSequence::iterator>(&S) + 1;
}
74
75
/// Returns the ASTContext of the declaration that contains this sequence.
/// The sequence must have a containing declaration (checked by assertion).
ASTContext &StmtSequence::getASTContext() const {
  assert(D);
  const Decl *Owner = D;
  return Owner->getASTContext();
}
79
80
/// Returns the begin location of the first statement in this sequence.
SourceLocation StmtSequence::getBeginLoc() const {
  const Stmt *First = front();
  return First->getBeginLoc();
}
83
84
/// Returns the end location of the last statement in this sequence.
SourceLocation StmtSequence::getEndLoc() const {
  const Stmt *Last = back();
  return Last->getEndLoc();
}
85
86
/// Returns the range spanning from getBeginLoc() to getEndLoc().
SourceRange StmtSequence::getSourceRange() const {
  const SourceLocation Begin = getBeginLoc();
  const SourceLocation End = getEndLoc();
  return SourceRange(Begin, End);
}
89
90
void CloneDetector::analyzeCodeBody(const Decl *D) {
91
assert(D);
92
assert(D->hasBody());
93
94
Sequences.push_back(StmtSequence(D->getBody(), D));
95
}
96
97
/// Returns true if and only if \p Stmt contains at least one other
98
/// sequence in the \p Group.
99
static bool containsAnyInGroup(StmtSequence &Seq,
100
CloneDetector::CloneGroup &Group) {
101
for (StmtSequence &GroupSeq : Group) {
102
if (Seq.contains(GroupSeq))
103
return true;
104
}
105
return false;
106
}
107
108
/// Returns true if and only if all sequences in \p OtherGroup are
109
/// contained by a sequence in \p Group.
110
static bool containsGroup(CloneDetector::CloneGroup &Group,
111
CloneDetector::CloneGroup &OtherGroup) {
112
// We have less sequences in the current group than we have in the other,
113
// so we will never fulfill the requirement for returning true. This is only
114
// possible because we know that a sequence in Group can contain at most
115
// one sequence in OtherGroup.
116
if (Group.size() < OtherGroup.size())
117
return false;
118
119
for (StmtSequence &Stmt : Group) {
120
if (!containsAnyInGroup(Stmt, OtherGroup))
121
return false;
122
}
123
return true;
124
}
125
126
void OnlyLargestCloneConstraint::constrain(
127
std::vector<CloneDetector::CloneGroup> &Result) {
128
std::vector<unsigned> IndexesToRemove;
129
130
// Compare every group in the result with the rest. If one groups contains
131
// another group, we only need to return the bigger group.
132
// Note: This doesn't scale well, so if possible avoid calling any heavy
133
// function from this loop to minimize the performance impact.
134
for (unsigned i = 0; i < Result.size(); ++i) {
135
for (unsigned j = 0; j < Result.size(); ++j) {
136
// Don't compare a group with itself.
137
if (i == j)
138
continue;
139
140
if (containsGroup(Result[j], Result[i])) {
141
IndexesToRemove.push_back(i);
142
break;
143
}
144
}
145
}
146
147
// Erasing a list of indexes from the vector should be done with decreasing
148
// indexes. As IndexesToRemove is constructed with increasing values, we just
149
// reverse iterate over it to get the desired order.
150
for (unsigned I : llvm::reverse(IndexesToRemove))
151
Result.erase(Result.begin() + I);
152
}
153
154
bool FilenamePatternConstraint::isAutoGenerated(
155
const CloneDetector::CloneGroup &Group) {
156
if (IgnoredFilesPattern.empty() || Group.empty() ||
157
!IgnoredFilesRegex->isValid())
158
return false;
159
160
for (const StmtSequence &S : Group) {
161
const SourceManager &SM = S.getASTContext().getSourceManager();
162
StringRef Filename = llvm::sys::path::filename(
163
SM.getFilename(S.getContainingDecl()->getLocation()));
164
if (IgnoredFilesRegex->match(Filename))
165
return true;
166
}
167
168
return false;
169
}
170
171
/// Defines what a type II code clone is: two statements for which this class
/// collects the same data are considered clones of each other.
///
/// Everything collected is forwarded to a data consumer of type T, which has
/// to provide a member function with the signature:
///   update(StringRef Str)
namespace {
template <class T>
class CloneTypeIIStmtDataCollector
    : public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {
  /// Shorthand for the visitor base class this collector derives from.
  using VisitorBase = ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>;

  ASTContext &Context;
  /// The sink that receives all collected data.
  T &DataConsumer;

  /// Forwards a single piece of data to the consumer.
  template <class Ty> void addData(const Ty &Data) {
    data_collection::addDataToConsumer(DataConsumer, Data);
  }

public:
  /// Collects the data of \p S into \p DataConsumer right away by visiting
  /// \p S from the constructor.
  CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,
                               T &DataConsumer)
      : Context(Context), DataConsumer(DataConsumer) {
    this->Visit(S);
  }

// For each statement class, define a visit method that collects its data and
// then continues with the visit method of the base visitor. The methods are
// templates so that the hand-written visit methods below take precedence.
#define DEF_ADD_DATA(CLASS, CODE)                                              \
  template <class = void> void Visit##CLASS(const CLASS *S) {                  \
    CODE;                                                                      \
    VisitorBase::Visit##CLASS(S);                                              \
  }

#include "clang/AST/StmtDataCollectors.inc"

// Type II clones ignore variable names and literals, so these classes only
// forward to the base visitor without collecting any data themselves.
#define SKIP(CLASS)                                                            \
  void Visit##CLASS(const CLASS *S) {                                          \
    VisitorBase::Visit##CLASS(S);                                              \
  }
  SKIP(DeclRefExpr)
  SKIP(MemberExpr)
  SKIP(IntegerLiteral)
  SKIP(FloatingLiteral)
  SKIP(StringLiteral)
  SKIP(CXXBoolLiteralExpr)
  SKIP(CharacterLiteral)
#undef SKIP
};
} // end anonymous namespace
223
224
/// Finalizes \p Hash and folds the resulting digest into a size_t.
///
/// \param Hash The MD5 state to finalize; it is consumed by this call.
/// \return As many leading bytes of the digest as fit into a size_t.
static size_t createHash(llvm::MD5 &Hash) {
  // Create the final hash code for the current Stmt.
  llvm::MD5::MD5Result HashResult;
  Hash.final(HashResult);

  // Zero-initialize so the returned value is fully defined even if the digest
  // were ever smaller than size_t.
  size_t HashCode = 0;

  // Copy as much as possible of the generated digest into the hash code.
  const size_t BytesToCopy = std::min(sizeof(HashCode), sizeof(HashResult));
  std::memcpy(&HashCode, &HashResult, BytesToCopy);

  return HashCode;
}
238
239
/// Generates and saves a hash code for the given Stmt.
240
/// \param S The given Stmt.
241
/// \param D The Decl containing S.
242
/// \param StmtsByHash Output parameter that will contain the hash codes for
243
/// each StmtSequence in the given Stmt.
244
/// \return The hash code of the given Stmt.
245
///
246
/// If the given Stmt is a CompoundStmt, this method will also generate
247
/// hashes for all possible StmtSequences in the children of this Stmt.
248
static size_t
249
saveHash(const Stmt *S, const Decl *D,
250
std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) {
251
llvm::MD5 Hash;
252
ASTContext &Context = D->getASTContext();
253
254
CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);
255
256
auto CS = dyn_cast<CompoundStmt>(S);
257
SmallVector<size_t, 8> ChildHashes;
258
259
for (const Stmt *Child : S->children()) {
260
if (Child == nullptr) {
261
ChildHashes.push_back(0);
262
continue;
263
}
264
size_t ChildHash = saveHash(Child, D, StmtsByHash);
265
Hash.update(
266
StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));
267
ChildHashes.push_back(ChildHash);
268
}
269
270
if (CS) {
271
// If we're in a CompoundStmt, we hash all possible combinations of child
272
// statements to find clones in those subsequences.
273
// We first go through every possible starting position of a subsequence.
274
for (unsigned Pos = 0; Pos < CS->size(); ++Pos) {
275
// Then we try all possible lengths this subsequence could have and
276
// reuse the same hash object to make sure we only hash every child
277
// hash exactly once.
278
llvm::MD5 Hash;
279
for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) {
280
// Grab the current child hash and put it into our hash. We do
281
// -1 on the index because we start counting the length at 1.
282
size_t ChildHash = ChildHashes[Pos + Length - 1];
283
Hash.update(
284
StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));
285
// If we have at least two elements in our subsequence, we can start
286
// saving it.
287
if (Length > 1) {
288
llvm::MD5 SubHash = Hash;
289
StmtsByHash.push_back(std::make_pair(
290
createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length)));
291
}
292
}
293
}
294
}
295
296
size_t HashCode = createHash(Hash);
297
StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D)));
298
return HashCode;
299
}
300
301
namespace {
302
/// Wrapper around FoldingSetNodeID that it can be used as the template
303
/// argument of the StmtDataCollector.
304
class FoldingSetNodeIDWrapper {
305
306
llvm::FoldingSetNodeID &FS;
307
308
public:
309
FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}
310
311
void update(StringRef Str) { FS.AddString(Str); }
312
};
313
} // end anonymous namespace
314
315
/// Writes the relevant data from all statements and child statements
316
/// in the given StmtSequence into the given FoldingSetNodeID.
317
static void CollectStmtSequenceData(const StmtSequence &Sequence,
318
FoldingSetNodeIDWrapper &OutputData) {
319
for (const Stmt *S : Sequence) {
320
CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(
321
S, Sequence.getASTContext(), OutputData);
322
323
for (const Stmt *Child : S->children()) {
324
if (!Child)
325
continue;
326
327
CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()),
328
OutputData);
329
}
330
}
331
}
332
333
/// Returns true if both sequences are clones of each other.
334
static bool areSequencesClones(const StmtSequence &LHS,
335
const StmtSequence &RHS) {
336
// We collect the data from all statements in the sequence as we did before
337
// when generating a hash value for each sequence. But this time we don't
338
// hash the collected data and compare the whole data set instead. This
339
// prevents any false-positives due to hash code collisions.
340
llvm::FoldingSetNodeID DataLHS, DataRHS;
341
FoldingSetNodeIDWrapper LHSWrapper(DataLHS);
342
FoldingSetNodeIDWrapper RHSWrapper(DataRHS);
343
344
CollectStmtSequenceData(LHS, LHSWrapper);
345
CollectStmtSequenceData(RHS, RHSWrapper);
346
347
return DataLHS == DataRHS;
348
}
349
350
void RecursiveCloneTypeIIHashConstraint::constrain(
351
std::vector<CloneDetector::CloneGroup> &Sequences) {
352
// FIXME: Maybe we can do this in-place and don't need this additional vector.
353
std::vector<CloneDetector::CloneGroup> Result;
354
355
for (CloneDetector::CloneGroup &Group : Sequences) {
356
// We assume in the following code that the Group is non-empty, so we
357
// skip all empty groups.
358
if (Group.empty())
359
continue;
360
361
std::vector<std::pair<size_t, StmtSequence>> StmtsByHash;
362
363
// Generate hash codes for all children of S and save them in StmtsByHash.
364
for (const StmtSequence &S : Group) {
365
saveHash(S.front(), S.getContainingDecl(), StmtsByHash);
366
}
367
368
// Sort hash_codes in StmtsByHash.
369
llvm::stable_sort(StmtsByHash, llvm::less_first());
370
371
// Check for each StmtSequence if its successor has the same hash value.
372
// We don't check the last StmtSequence as it has no successor.
373
// Note: The 'size - 1 ' in the condition is safe because we check for an
374
// empty Group vector at the beginning of this function.
375
for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) {
376
const auto Current = StmtsByHash[i];
377
378
// It's likely that we just found a sequence of StmtSequences that
379
// represent a CloneGroup, so we create a new group and start checking and
380
// adding the StmtSequences in this sequence.
381
CloneDetector::CloneGroup NewGroup;
382
383
size_t PrototypeHash = Current.first;
384
385
for (; i < StmtsByHash.size(); ++i) {
386
// A different hash value means we have reached the end of the sequence.
387
if (PrototypeHash != StmtsByHash[i].first) {
388
// The current sequence could be the start of a new CloneGroup. So we
389
// decrement i so that we visit it again in the outer loop.
390
// Note: i can never be 0 at this point because we are just comparing
391
// the hash of the Current StmtSequence with itself in the 'if' above.
392
assert(i != 0);
393
--i;
394
break;
395
}
396
// Same hash value means we should add the StmtSequence to the current
397
// group.
398
NewGroup.push_back(StmtsByHash[i].second);
399
}
400
401
// We created a new clone group with matching hash codes and move it to
402
// the result vector.
403
Result.push_back(NewGroup);
404
}
405
}
406
// Sequences is the output parameter, so we copy our result into it.
407
Sequences = Result;
408
}
409
410
void RecursiveCloneTypeIIVerifyConstraint::constrain(
411
std::vector<CloneDetector::CloneGroup> &Sequences) {
412
CloneConstraint::splitCloneGroups(
413
Sequences, [](const StmtSequence &A, const StmtSequence &B) {
414
return areSequencesClones(A, B);
415
});
416
}
417
418
size_t MinComplexityConstraint::calculateStmtComplexity(
419
const StmtSequence &Seq, std::size_t Limit,
420
const std::string &ParentMacroStack) {
421
if (Seq.empty())
422
return 0;
423
424
size_t Complexity = 1;
425
426
ASTContext &Context = Seq.getASTContext();
427
428
// Look up what macros expanded into the current statement.
429
std::string MacroStack =
430
data_collection::getMacroStack(Seq.getBeginLoc(), Context);
431
432
// First, check if ParentMacroStack is not empty which means we are currently
433
// dealing with a parent statement which was expanded from a macro.
434
// If this parent statement was expanded from the same macros as this
435
// statement, we reduce the initial complexity of this statement to zero.
436
// This causes that a group of statements that were generated by a single
437
// macro expansion will only increase the total complexity by one.
438
// Note: This is not the final complexity of this statement as we still
439
// add the complexity of the child statements to the complexity value.
440
if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) {
441
Complexity = 0;
442
}
443
444
// Iterate over the Stmts in the StmtSequence and add their complexity values
445
// to the current complexity value.
446
if (Seq.holdsSequence()) {
447
for (const Stmt *S : Seq) {
448
Complexity += calculateStmtComplexity(
449
StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);
450
if (Complexity >= Limit)
451
return Limit;
452
}
453
} else {
454
for (const Stmt *S : Seq.front()->children()) {
455
Complexity += calculateStmtComplexity(
456
StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);
457
if (Complexity >= Limit)
458
return Limit;
459
}
460
}
461
return Complexity;
462
}
463
464
void MatchingVariablePatternConstraint::constrain(
465
std::vector<CloneDetector::CloneGroup> &CloneGroups) {
466
CloneConstraint::splitCloneGroups(
467
CloneGroups, [](const StmtSequence &A, const StmtSequence &B) {
468
VariablePattern PatternA(A);
469
VariablePattern PatternB(B);
470
return PatternA.countPatternDifferences(PatternB) == 0;
471
});
472
}
473
474
void CloneConstraint::splitCloneGroups(
475
std::vector<CloneDetector::CloneGroup> &CloneGroups,
476
llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
477
Compare) {
478
std::vector<CloneDetector::CloneGroup> Result;
479
for (auto &HashGroup : CloneGroups) {
480
// Contains all indexes in HashGroup that were already added to a
481
// CloneGroup.
482
std::vector<char> Indexes;
483
Indexes.resize(HashGroup.size());
484
485
for (unsigned i = 0; i < HashGroup.size(); ++i) {
486
// Skip indexes that are already part of a CloneGroup.
487
if (Indexes[i])
488
continue;
489
490
// Pick the first unhandled StmtSequence and consider it as the
491
// beginning
492
// of a new CloneGroup for now.
493
// We don't add i to Indexes because we never iterate back.
494
StmtSequence Prototype = HashGroup[i];
495
CloneDetector::CloneGroup PotentialGroup = {Prototype};
496
++Indexes[i];
497
498
// Check all following StmtSequences for clones.
499
for (unsigned j = i + 1; j < HashGroup.size(); ++j) {
500
// Skip indexes that are already part of a CloneGroup.
501
if (Indexes[j])
502
continue;
503
504
// If a following StmtSequence belongs to our CloneGroup, we add it.
505
const StmtSequence &Candidate = HashGroup[j];
506
507
if (!Compare(Prototype, Candidate))
508
continue;
509
510
PotentialGroup.push_back(Candidate);
511
// Make sure we never visit this StmtSequence again.
512
++Indexes[j];
513
}
514
515
// Otherwise, add it to the result and continue searching for more
516
// groups.
517
Result.push_back(PotentialGroup);
518
}
519
520
assert(llvm::all_of(Indexes, [](char c) { return c == 1; }));
521
}
522
CloneGroups = Result;
523
}
524
525
void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,
526
const Stmt *Mention) {
527
// First check if we already reference this variable
528
for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
529
if (Variables[KindIndex] == VarDecl) {
530
// If yes, add a new occurrence that points to the existing entry in
531
// the Variables vector.
532
Occurences.emplace_back(KindIndex, Mention);
533
return;
534
}
535
}
536
// If this variable wasn't already referenced, add it to the list of
537
// referenced variables and add a occurrence that points to this new entry.
538
Occurences.emplace_back(Variables.size(), Mention);
539
Variables.push_back(VarDecl);
540
}
541
542
void VariablePattern::addVariables(const Stmt *S) {
543
// Sometimes we get a nullptr (such as from IfStmts which often have nullptr
544
// children). We skip such statements as they don't reference any
545
// variables.
546
if (!S)
547
return;
548
549
// Check if S is a reference to a variable. If yes, add it to the pattern.
550
if (auto D = dyn_cast<DeclRefExpr>(S)) {
551
if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
552
addVariableOccurence(VD, D);
553
}
554
555
// Recursively check all children of the given statement.
556
for (const Stmt *Child : S->children()) {
557
addVariables(Child);
558
}
559
}
560
561
unsigned VariablePattern::countPatternDifferences(
562
const VariablePattern &Other,
563
VariablePattern::SuspiciousClonePair *FirstMismatch) {
564
unsigned NumberOfDifferences = 0;
565
566
assert(Other.Occurences.size() == Occurences.size());
567
for (unsigned i = 0; i < Occurences.size(); ++i) {
568
auto ThisOccurence = Occurences[i];
569
auto OtherOccurence = Other.Occurences[i];
570
if (ThisOccurence.KindID == OtherOccurence.KindID)
571
continue;
572
573
++NumberOfDifferences;
574
575
// If FirstMismatch is not a nullptr, we need to store information about
576
// the first difference between the two patterns.
577
if (FirstMismatch == nullptr)
578
continue;
579
580
// Only proceed if we just found the first difference as we only store
581
// information about the first difference.
582
if (NumberOfDifferences != 1)
583
continue;
584
585
const VarDecl *FirstSuggestion = nullptr;
586
// If there is a variable available in the list of referenced variables
587
// which wouldn't break the pattern if it is used in place of the
588
// current variable, we provide this variable as the suggested fix.
589
if (OtherOccurence.KindID < Variables.size())
590
FirstSuggestion = Variables[OtherOccurence.KindID];
591
592
// Store information about the first clone.
593
FirstMismatch->FirstCloneInfo =
594
VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(
595
Variables[ThisOccurence.KindID], ThisOccurence.Mention,
596
FirstSuggestion);
597
598
// Same as above but with the other clone. We do this for both clones as
599
// we don't know which clone is the one containing the unintended
600
// pattern error.
601
const VarDecl *SecondSuggestion = nullptr;
602
if (ThisOccurence.KindID < Other.Variables.size())
603
SecondSuggestion = Other.Variables[ThisOccurence.KindID];
604
605
// Store information about the second clone.
606
FirstMismatch->SecondCloneInfo =
607
VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(
608
Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention,
609
SecondSuggestion);
610
611
// SuspiciousClonePair guarantees that the first clone always has a
612
// suggested variable associated with it. As we know that one of the two
613
// clones in the pair always has suggestion, we swap the two clones
614
// in case the first clone has no suggested variable which means that
615
// the second clone has a suggested variable and should be first.
616
if (!FirstMismatch->FirstCloneInfo.Suggestion)
617
std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo);
618
619
// This ensures that we always have at least one suggestion in a pair.
620
assert(FirstMismatch->FirstCloneInfo.Suggestion);
621
}
622
623
return NumberOfDifferences;
624
}
625
626