Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp
35266 views
1
//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass does the inverse transformation of what LICM does.
10
// It traverses all of the instructions in the loop's preheader and sinks
11
// them to the loop body where frequency is lower than the loop's preheader.
12
// This pass is a reverse-transformation of LICM. It differs from the Sink
13
// pass in the following ways:
14
//
15
// * It only handles sinking of instructions from the loop's preheader to the
16
// loop's body
17
// * It uses alias set tracker to get more accurate alias info
18
// * It uses block frequency info to find the optimal sinking locations
19
//
20
// Overall algorithm:
21
//
22
// For I in Preheader:
23
// InsertBBs = BBs that uses I
24
// For BB in sorted(LoopBBs):
25
// DomBBs = BBs in InsertBBs that are dominated by BB
26
// if freq(DomBBs) > freq(BB)
27
// InsertBBs = UseBBs - DomBBs + BB
28
// For BB in InsertBBs:
29
// Insert I at BB's beginning
30
//
31
//===----------------------------------------------------------------------===//
32
33
#include "llvm/Transforms/Scalar/LoopSink.h"
34
#include "llvm/ADT/SetOperations.h"
35
#include "llvm/ADT/Statistic.h"
36
#include "llvm/Analysis/AliasAnalysis.h"
37
#include "llvm/Analysis/BlockFrequencyInfo.h"
38
#include "llvm/Analysis/LoopInfo.h"
39
#include "llvm/Analysis/MemorySSA.h"
40
#include "llvm/Analysis/MemorySSAUpdater.h"
41
#include "llvm/Analysis/ScalarEvolution.h"
42
#include "llvm/IR/Dominators.h"
43
#include "llvm/IR/Instructions.h"
44
#include "llvm/Support/BranchProbability.h"
45
#include "llvm/Support/CommandLine.h"
46
#include "llvm/Transforms/Scalar.h"
47
#include "llvm/Transforms/Utils/Local.h"
48
#include "llvm/Transforms/Utils/LoopUtils.h"
49
using namespace llvm;
50
51
#define DEBUG_TYPE "loopsink"
52
53
STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
54
STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
55
56
static cl::opt<unsigned> SinkFrequencyPercentThreshold(
57
"sink-freq-percent-threshold", cl::Hidden, cl::init(90),
58
cl::desc("Do not sink instructions that require cloning unless they "
59
"execute less than this percent of the time."));
60
61
static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
62
"max-uses-for-sinking", cl::Hidden, cl::init(30),
63
cl::desc("Do not sink instructions that have too many uses."));
64
65
/// Return adjusted total frequency of \p BBs.
66
///
67
/// * If there is only one BB, sinking instruction will not introduce code
68
/// size increase. Thus there is no need to adjust the frequency.
69
/// * If there are more than one BB, sinking would lead to code size increase.
70
/// In this case, we add some "tax" to the total frequency to make it harder
71
/// to sink. E.g.
72
/// Freq(Preheader) = 100
73
/// Freq(BBs) = sum(50, 49) = 99
74
/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
75
/// BBs as the difference is too small to justify the code size increase.
76
/// To model this, The adjusted Freq(BBs) will be:
77
/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
78
static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
79
BlockFrequencyInfo &BFI) {
80
BlockFrequency T(0);
81
for (BasicBlock *B : BBs)
82
T += BFI.getBlockFreq(B);
83
if (BBs.size() > 1)
84
T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
85
return T;
86
}
87
88
/// Return a set of basic blocks to insert sinked instructions.
89
///
90
/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
91
///
92
/// * Inside the loop \p L
93
/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
94
/// that domintates the UseBB
95
/// * Has minimum total frequency that is no greater than preheader frequency
96
///
97
/// The purpose of the function is to find the optimal sinking points to
98
/// minimize execution cost, which is defined as "sum of frequency of
99
/// BBsToSinkInto".
100
/// As a result, the returned BBsToSinkInto needs to have minimum total
101
/// frequency.
102
/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
103
/// frequency, the optimal solution is not sinking (return empty set).
104
///
105
/// \p ColdLoopBBs is used to help find the optimal sinking locations.
106
/// It stores a list of BBs that is:
107
///
108
/// * Inside the loop \p L
109
/// * Has a frequency no larger than the loop's preheader
110
/// * Sorted by BB frequency
111
///
112
/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
113
/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
114
/// caller.
115
static SmallPtrSet<BasicBlock *, 2>
116
findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
117
const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
118
DominatorTree &DT, BlockFrequencyInfo &BFI) {
119
SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
120
if (UseBBs.size() == 0)
121
return BBsToSinkInto;
122
123
BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
124
SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
125
126
// For every iteration:
127
// * Pick the ColdestBB from ColdLoopBBs
128
// * Find the set BBsDominatedByColdestBB that satisfy:
129
// - BBsDominatedByColdestBB is a subset of BBsToSinkInto
130
// - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
131
// * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
132
// BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
133
// BBsToSinkInto
134
for (BasicBlock *ColdestBB : ColdLoopBBs) {
135
BBsDominatedByColdestBB.clear();
136
for (BasicBlock *SinkedBB : BBsToSinkInto)
137
if (DT.dominates(ColdestBB, SinkedBB))
138
BBsDominatedByColdestBB.insert(SinkedBB);
139
if (BBsDominatedByColdestBB.size() == 0)
140
continue;
141
if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
142
BFI.getBlockFreq(ColdestBB)) {
143
for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
144
BBsToSinkInto.erase(DominatedBB);
145
}
146
BBsToSinkInto.insert(ColdestBB);
147
}
148
}
149
150
// Can't sink into blocks that have no valid insertion point.
151
for (BasicBlock *BB : BBsToSinkInto) {
152
if (BB->getFirstInsertionPt() == BB->end()) {
153
BBsToSinkInto.clear();
154
break;
155
}
156
}
157
158
// If the total frequency of BBsToSinkInto is larger than preheader frequency,
159
// do not sink.
160
if (adjustedSumFreq(BBsToSinkInto, BFI) >
161
BFI.getBlockFreq(L.getLoopPreheader()))
162
BBsToSinkInto.clear();
163
return BBsToSinkInto;
164
}
165
166
// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
167
// sinking is successful.
168
// \p LoopBlockNumber is used to sort the insertion blocks to ensure
169
// determinism.
170
static bool sinkInstruction(
171
Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
172
const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI,
173
DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) {
174
// Compute the set of blocks in loop L which contain a use of I.
175
SmallPtrSet<BasicBlock *, 2> BBs;
176
for (auto &U : I.uses()) {
177
Instruction *UI = cast<Instruction>(U.getUser());
178
179
// We cannot sink I if it has uses outside of the loop.
180
if (!L.contains(LI.getLoopFor(UI->getParent())))
181
return false;
182
183
if (!isa<PHINode>(UI)) {
184
BBs.insert(UI->getParent());
185
continue;
186
}
187
188
// We cannot sink I to PHI-uses, try to look through PHI to find the incoming
189
// block of the value being used.
190
PHINode *PN = dyn_cast<PHINode>(UI);
191
BasicBlock *PhiBB = PN->getIncomingBlock(U);
192
193
// If value's incoming block is from loop preheader directly, there's no
194
// place to sink to, bailout.
195
if (L.getLoopPreheader() == PhiBB)
196
return false;
197
198
BBs.insert(PhiBB);
199
}
200
201
// findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
202
// BBs.size() to avoid expensive computation.
203
// FIXME: Handle code size growth for min_size and opt_size.
204
if (BBs.size() > MaxNumberOfUseBBsForSinking)
205
return false;
206
207
// Find the set of BBs that we should insert a copy of I.
208
SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
209
findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
210
if (BBsToSinkInto.empty())
211
return false;
212
213
// Return if any of the candidate blocks to sink into is non-cold.
214
if (BBsToSinkInto.size() > 1 &&
215
!llvm::set_is_subset(BBsToSinkInto, LoopBlockNumber))
216
return false;
217
218
// Copy the final BBs into a vector and sort them using the total ordering
219
// of the loop block numbers as iterating the set doesn't give a useful
220
// order. No need to stable sort as the block numbers are a total ordering.
221
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
222
llvm::append_range(SortedBBsToSinkInto, BBsToSinkInto);
223
if (SortedBBsToSinkInto.size() > 1) {
224
llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
225
return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
226
});
227
}
228
229
BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
230
// FIXME: Optimize the efficiency for cloned value replacement. The current
231
// implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
232
for (BasicBlock *N : ArrayRef(SortedBBsToSinkInto).drop_front(1)) {
233
assert(LoopBlockNumber.find(N)->second >
234
LoopBlockNumber.find(MoveBB)->second &&
235
"BBs not sorted!");
236
// Clone I and replace its uses.
237
Instruction *IC = I.clone();
238
IC->setName(I.getName());
239
IC->insertBefore(&*N->getFirstInsertionPt());
240
241
if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
242
// Create a new MemoryAccess and let MemorySSA set its defining access.
243
MemoryAccess *NewMemAcc =
244
MSSAU->createMemoryAccessInBB(IC, nullptr, N, MemorySSA::Beginning);
245
if (NewMemAcc) {
246
if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
247
MSSAU->insertDef(MemDef, /*RenameUses=*/true);
248
else {
249
auto *MemUse = cast<MemoryUse>(NewMemAcc);
250
MSSAU->insertUse(MemUse, /*RenameUses=*/true);
251
}
252
}
253
}
254
255
// Replaces uses of I with IC in N, except PHI-use which is being taken
256
// care of by defs in PHI's incoming blocks.
257
I.replaceUsesWithIf(IC, [N](Use &U) {
258
Instruction *UIToReplace = cast<Instruction>(U.getUser());
259
return UIToReplace->getParent() == N && !isa<PHINode>(UIToReplace);
260
});
261
// Replaces uses of I with IC in blocks dominated by N
262
replaceDominatedUsesWith(&I, IC, DT, N);
263
LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
264
<< '\n');
265
NumLoopSunkCloned++;
266
}
267
LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
268
NumLoopSunk++;
269
I.moveBefore(&*MoveBB->getFirstInsertionPt());
270
271
if (MSSAU)
272
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
273
MSSAU->getMemorySSA()->getMemoryAccess(&I)))
274
MSSAU->moveToPlace(OldMemAcc, MoveBB, MemorySSA::Beginning);
275
276
return true;
277
}
278
279
/// Sinks instructions from loop's preheader to the loop body if the
280
/// sum frequency of inserted copy is smaller than preheader's frequency.
281
static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
282
DominatorTree &DT,
283
BlockFrequencyInfo &BFI,
284
MemorySSA &MSSA,
285
ScalarEvolution *SE) {
286
BasicBlock *Preheader = L.getLoopPreheader();
287
assert(Preheader && "Expected loop to have preheader");
288
289
assert(Preheader->getParent()->hasProfileData() &&
290
"Unexpected call when profile data unavailable.");
291
292
const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
293
// If there are no basic blocks with lower frequency than the preheader then
294
// we can avoid the detailed analysis as we will never find profitable sinking
295
// opportunities.
296
if (all_of(L.blocks(), [&](const BasicBlock *BB) {
297
return BFI.getBlockFreq(BB) > PreheaderFreq;
298
}))
299
return false;
300
301
MemorySSAUpdater MSSAU(&MSSA);
302
SinkAndHoistLICMFlags LICMFlags(/*IsSink=*/true, L, MSSA);
303
304
bool Changed = false;
305
306
// Sort loop's basic blocks by frequency
307
SmallVector<BasicBlock *, 10> ColdLoopBBs;
308
SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
309
int i = 0;
310
for (BasicBlock *B : L.blocks())
311
if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
312
ColdLoopBBs.push_back(B);
313
LoopBlockNumber[B] = ++i;
314
}
315
llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
316
return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
317
});
318
319
// Traverse preheader's instructions in reverse order because if A depends
320
// on B (A appears after B), A needs to be sunk first before B can be
321
// sinked.
322
for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
323
if (isa<PHINode>(&I))
324
continue;
325
// No need to check for instruction's operands are loop invariant.
326
assert(L.hasLoopInvariantOperands(&I) &&
327
"Insts in a loop's preheader should have loop invariant operands!");
328
if (!canSinkOrHoistInst(I, &AA, &DT, &L, MSSAU, false, LICMFlags))
329
continue;
330
if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
331
&MSSAU)) {
332
Changed = true;
333
if (SE)
334
SE->forgetBlockAndLoopDispositions(&I);
335
}
336
}
337
338
return Changed;
339
}
340
341
PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
342
// Enable LoopSink only when runtime profile is available.
343
// With static profile, the sinking decision may be sub-optimal.
344
if (!F.hasProfileData())
345
return PreservedAnalyses::all();
346
347
LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
348
// Nothing to do if there are no loops.
349
if (LI.empty())
350
return PreservedAnalyses::all();
351
352
AAResults &AA = FAM.getResult<AAManager>(F);
353
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
354
BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
355
MemorySSA &MSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
356
357
// We want to do a postorder walk over the loops. Since loops are a tree this
358
// is equivalent to a reversed preorder walk and preorder is easy to compute
359
// without recursion. Since we reverse the preorder, we will visit siblings
360
// in reverse program order. This isn't expected to matter at all but is more
361
// consistent with sinking algorithms which generally work bottom-up.
362
SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
363
364
bool Changed = false;
365
do {
366
Loop &L = *PreorderLoops.pop_back_val();
367
368
BasicBlock *Preheader = L.getLoopPreheader();
369
if (!Preheader)
370
continue;
371
372
// Note that we don't pass SCEV here because it is only used to invalidate
373
// loops in SCEV and we don't preserve (or request) SCEV at all making that
374
// unnecessary.
375
Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, MSSA,
376
/*ScalarEvolution*/ nullptr);
377
} while (!PreorderLoops.empty());
378
379
if (!Changed)
380
return PreservedAnalyses::all();
381
382
PreservedAnalyses PA;
383
PA.preserveSet<CFGAnalyses>();
384
PA.preserve<MemorySSAAnalysis>();
385
386
if (VerifyMemorySSA)
387
MSSA.verifyMemorySSA();
388
389
return PA;
390
}
391
392