Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp
35266 views
1
//===- PartialInlining.cpp - Inline parts of functions --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass performs partial inlining, typically by inlining an if statement
10
// that surrounds the body of the function.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "llvm/Transforms/IPO/PartialInlining.h"
15
#include "llvm/ADT/DenseMap.h"
16
#include "llvm/ADT/DenseSet.h"
17
#include "llvm/ADT/DepthFirstIterator.h"
18
#include "llvm/ADT/STLExtras.h"
19
#include "llvm/ADT/SmallVector.h"
20
#include "llvm/ADT/Statistic.h"
21
#include "llvm/Analysis/BlockFrequencyInfo.h"
22
#include "llvm/Analysis/BranchProbabilityInfo.h"
23
#include "llvm/Analysis/InlineCost.h"
24
#include "llvm/Analysis/LoopInfo.h"
25
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26
#include "llvm/Analysis/ProfileSummaryInfo.h"
27
#include "llvm/Analysis/TargetLibraryInfo.h"
28
#include "llvm/Analysis/TargetTransformInfo.h"
29
#include "llvm/IR/Attributes.h"
30
#include "llvm/IR/BasicBlock.h"
31
#include "llvm/IR/CFG.h"
32
#include "llvm/IR/DebugLoc.h"
33
#include "llvm/IR/DiagnosticInfo.h"
34
#include "llvm/IR/Dominators.h"
35
#include "llvm/IR/Function.h"
36
#include "llvm/IR/InstrTypes.h"
37
#include "llvm/IR/Instruction.h"
38
#include "llvm/IR/Instructions.h"
39
#include "llvm/IR/IntrinsicInst.h"
40
#include "llvm/IR/Intrinsics.h"
41
#include "llvm/IR/Module.h"
42
#include "llvm/IR/Operator.h"
43
#include "llvm/IR/ProfDataUtils.h"
44
#include "llvm/IR/User.h"
45
#include "llvm/Support/BlockFrequency.h"
46
#include "llvm/Support/BranchProbability.h"
47
#include "llvm/Support/Casting.h"
48
#include "llvm/Support/CommandLine.h"
49
#include "llvm/Support/ErrorHandling.h"
50
#include "llvm/Transforms/IPO.h"
51
#include "llvm/Transforms/Utils/Cloning.h"
52
#include "llvm/Transforms/Utils/CodeExtractor.h"
53
#include "llvm/Transforms/Utils/ValueMapper.h"
54
#include <algorithm>
55
#include <cassert>
56
#include <cstdint>
57
#include <memory>
58
#include <tuple>
59
#include <vector>
60
61
using namespace llvm;
62
63
#define DEBUG_TYPE "partial-inlining"
64
65
STATISTIC(NumPartialInlined,
66
"Number of callsites functions partially inlined into.");
67
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
68
"cold outlined regions were partially "
69
"inlined into its caller(s).");
70
STATISTIC(NumColdRegionsFound,
71
"Number of cold single entry/exit regions found.");
72
STATISTIC(NumColdRegionsOutlined,
73
"Number of cold single entry/exit regions outlined.");
74
75
// Command line option to disable partial-inlining. The default is false:
76
static cl::opt<bool>
77
DisablePartialInlining("disable-partial-inlining", cl::init(false),
78
cl::Hidden, cl::desc("Disable partial inlining"));
79
// Command line option to disable multi-region partial-inlining. The default is
80
// false:
81
static cl::opt<bool> DisableMultiRegionPartialInline(
82
"disable-mr-partial-inlining", cl::init(false), cl::Hidden,
83
cl::desc("Disable multi-region partial inlining"));
84
85
// Command line option to force outlining in regions with live exit variables.
86
// The default is false:
87
static cl::opt<bool>
88
ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
89
cl::desc("Force outline regions with live exits"));
90
91
// Command line option to enable marking outline functions with Cold Calling
92
// Convention. The default is false:
93
static cl::opt<bool>
94
MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
95
cl::desc("Mark outline function calls with ColdCC"));
96
97
// This is an option used by testing:
98
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
99
100
cl::ReallyHidden,
101
cl::desc("Skip Cost Analysis"));
102
// Used to determine if a cold region is worth outlining based on
103
// its inlining cost compared to the original function. Default is set at 10%.
104
// ie. if the cold region reduces the inlining cost of the original function by
105
// at least 10%.
106
static cl::opt<float> MinRegionSizeRatio(
107
"min-region-size-ratio", cl::init(0.1), cl::Hidden,
108
cl::desc("Minimum ratio comparing relative sizes of each "
109
"outline candidate and original function"));
110
// Used to tune the minimum number of execution counts needed in the predecessor
111
// block to the cold edge. ie. confidence interval.
112
static cl::opt<unsigned>
113
MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
114
cl::desc("Minimum block executions to consider "
115
"its BranchProbabilityInfo valid"));
116
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
117
// if the branch probability is 10% or less, then it is deemed as 'cold'.
118
static cl::opt<float> ColdBranchRatio(
119
"cold-branch-ratio", cl::init(0.1), cl::Hidden,
120
cl::desc("Minimum BranchProbability to consider a region cold."));
121
122
static cl::opt<unsigned> MaxNumInlineBlocks(
123
"max-num-inline-blocks", cl::init(5), cl::Hidden,
124
cl::desc("Max number of blocks to be partially inlined"));
125
126
// Command line option to set the maximum number of partial inlining allowed
127
// for the module. The default value of -1 means no limit.
128
static cl::opt<int> MaxNumPartialInlining(
129
"max-partial-inlining", cl::init(-1), cl::Hidden,
130
cl::desc("Max number of partial inlining. The default is unlimited"));
131
132
// Used only when PGO or user annotated branch data is absent. It is
133
// the least value that is used to weigh the outline region. If BFI
134
// produces larger value, the BFI value will be used.
135
static cl::opt<int>
136
OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
137
cl::Hidden,
138
cl::desc("Relative frequency of outline region to "
139
"the entry block"));
140
141
static cl::opt<unsigned> ExtraOutliningPenalty(
142
"partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
143
cl::desc("A debug option to add additional penalty to the computed one."));
144
145
namespace {
146
147
struct FunctionOutliningInfo {
148
FunctionOutliningInfo() = default;
149
150
// Returns the number of blocks to be inlined including all blocks
151
// in Entries and one return block.
152
unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
153
154
// A set of blocks including the function entry that guard
155
// the region to be outlined.
156
SmallVector<BasicBlock *, 4> Entries;
157
158
// The return block that is not included in the outlined region.
159
BasicBlock *ReturnBlock = nullptr;
160
161
// The dominating block of the region to be outlined.
162
BasicBlock *NonReturnBlock = nullptr;
163
164
// The set of blocks in Entries that are predecessors to ReturnBlock
165
SmallVector<BasicBlock *, 4> ReturnBlockPreds;
166
};
167
168
struct FunctionOutliningMultiRegionInfo {
169
FunctionOutliningMultiRegionInfo() = default;
170
171
// Container for outline regions
172
struct OutlineRegionInfo {
173
OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
174
BasicBlock *EntryBlock, BasicBlock *ExitBlock,
175
BasicBlock *ReturnBlock)
176
: Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
177
ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
178
SmallVector<BasicBlock *, 8> Region;
179
BasicBlock *EntryBlock;
180
BasicBlock *ExitBlock;
181
BasicBlock *ReturnBlock;
182
};
183
184
SmallVector<OutlineRegionInfo, 4> ORI;
185
};
186
187
struct PartialInlinerImpl {
188
189
PartialInlinerImpl(
190
function_ref<AssumptionCache &(Function &)> GetAC,
191
function_ref<AssumptionCache *(Function &)> LookupAC,
192
function_ref<TargetTransformInfo &(Function &)> GTTI,
193
function_ref<const TargetLibraryInfo &(Function &)> GTLI,
194
ProfileSummaryInfo &ProfSI,
195
function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
196
: GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
197
GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
198
199
bool run(Module &M);
200
// Main part of the transformation that calls helper functions to find
201
// outlining candidates, clone & outline the function, and attempt to
202
// partially inline the resulting function. Returns true if
203
// inlining was successful, false otherwise. Also returns the outline
204
// function (only if we partially inlined early returns) as there is a
205
// possibility to further "peel" early return statements that were left in the
206
// outline function due to code size.
207
std::pair<bool, Function *> unswitchFunction(Function &F);
208
209
// This class speculatively clones the function to be partial inlined.
210
// At the end of partial inlining, the remaining callsites to the cloned
211
// function that are not partially inlined will be fixed up to reference
212
// the original function, and the cloned function will be erased.
213
struct FunctionCloner {
214
// Two constructors, one for single region outlining, the other for
215
// multi-region outlining.
216
FunctionCloner(Function *F, FunctionOutliningInfo *OI,
217
OptimizationRemarkEmitter &ORE,
218
function_ref<AssumptionCache *(Function &)> LookupAC,
219
function_ref<TargetTransformInfo &(Function &)> GetTTI);
220
FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
221
OptimizationRemarkEmitter &ORE,
222
function_ref<AssumptionCache *(Function &)> LookupAC,
223
function_ref<TargetTransformInfo &(Function &)> GetTTI);
224
225
~FunctionCloner();
226
227
// Prepare for function outlining: making sure there is only
228
// one incoming edge from the extracted/outlined region to
229
// the return block.
230
void normalizeReturnBlock() const;
231
232
// Do function outlining for cold regions.
233
bool doMultiRegionFunctionOutlining();
234
// Do function outlining for region after early return block(s).
235
// NOTE: For vararg functions that do the vararg handling in the outlined
236
// function, we temporarily generate IR that does not properly
237
// forward varargs to the outlined function. Calling InlineFunction
238
// will update calls to the outlined functions to properly forward
239
// the varargs.
240
Function *doSingleRegionFunctionOutlining();
241
242
Function *OrigFunc = nullptr;
243
Function *ClonedFunc = nullptr;
244
245
typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
246
// Keep track of Outlined Functions and the basic block they're called from.
247
SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
248
249
// ClonedFunc is inlined in one of its callers after function
250
// outlining.
251
bool IsFunctionInlined = false;
252
// The cost of the region to be outlined.
253
InstructionCost OutlinedRegionCost = 0;
254
// ClonedOI is specific to outlining non-early return blocks.
255
std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
256
// ClonedOMRI is specific to outlining cold regions.
257
std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
258
std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
259
OptimizationRemarkEmitter &ORE;
260
function_ref<AssumptionCache *(Function &)> LookupAC;
261
function_ref<TargetTransformInfo &(Function &)> GetTTI;
262
};
263
264
private:
265
int NumPartialInlining = 0;
266
function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
267
function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
268
function_ref<TargetTransformInfo &(Function &)> GetTTI;
269
function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
270
function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
271
ProfileSummaryInfo &PSI;
272
273
// Return the frequency of the OutlininingBB relative to F's entry point.
274
// The result is no larger than 1 and is represented using BP.
275
// (Note that the outlined region's 'head' block can only have incoming
276
// edges from the guarding entry blocks).
277
BranchProbability
278
getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
279
280
// Return true if the callee of CB should be partially inlined with
281
// profit.
282
bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
283
BlockFrequency WeightedOutliningRcost,
284
OptimizationRemarkEmitter &ORE) const;
285
286
// Try to inline DuplicateFunction (cloned from F with call to
287
// the OutlinedFunction into its callers. Return true
288
// if there is any successful inlining.
289
bool tryPartialInline(FunctionCloner &Cloner);
290
291
// Compute the mapping from use site of DuplicationFunction to the enclosing
292
// BB's profile count.
293
void
294
computeCallsiteToProfCountMap(Function *DuplicateFunction,
295
DenseMap<User *, uint64_t> &SiteCountMap) const;
296
297
bool isLimitReached() const {
298
return (MaxNumPartialInlining != -1 &&
299
NumPartialInlining >= MaxNumPartialInlining);
300
}
301
302
static CallBase *getSupportedCallBase(User *U) {
303
if (isa<CallInst>(U) || isa<InvokeInst>(U))
304
return cast<CallBase>(U);
305
llvm_unreachable("All uses must be calls");
306
return nullptr;
307
}
308
309
static CallBase *getOneCallSiteTo(Function &F) {
310
User *User = *F.user_begin();
311
return getSupportedCallBase(User);
312
}
313
314
std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
315
CallBase *CB = getOneCallSiteTo(F);
316
DebugLoc DLoc = CB->getDebugLoc();
317
BasicBlock *Block = CB->getParent();
318
return std::make_tuple(DLoc, Block);
319
}
320
321
// Returns the costs associated with function outlining:
322
// - The first value is the non-weighted runtime cost for making the call
323
// to the outlined function, including the addtional setup cost in the
324
// outlined function itself;
325
// - The second value is the estimated size of the new call sequence in
326
// basic block Cloner.OutliningCallBB;
327
std::tuple<InstructionCost, InstructionCost>
328
computeOutliningCosts(FunctionCloner &Cloner) const;
329
330
// Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
331
// approximate both the size and runtime cost (Note that in the current
332
// inline cost analysis, there is no clear distinction there either).
333
static InstructionCost computeBBInlineCost(BasicBlock *BB,
334
TargetTransformInfo *TTI);
335
336
std::unique_ptr<FunctionOutliningInfo>
337
computeOutliningInfo(Function &F) const;
338
339
std::unique_ptr<FunctionOutliningMultiRegionInfo>
340
computeOutliningColdRegionsInfo(Function &F,
341
OptimizationRemarkEmitter &ORE) const;
342
};
343
344
} // end anonymous namespace
345
346
std::unique_ptr<FunctionOutliningMultiRegionInfo>
347
PartialInlinerImpl::computeOutliningColdRegionsInfo(
348
Function &F, OptimizationRemarkEmitter &ORE) const {
349
BasicBlock *EntryBlock = &F.front();
350
351
DominatorTree DT(F);
352
LoopInfo LI(DT);
353
BranchProbabilityInfo BPI(F, LI);
354
std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
355
BlockFrequencyInfo *BFI;
356
if (!GetBFI) {
357
ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
358
BFI = ScopedBFI.get();
359
} else
360
BFI = &(GetBFI(F));
361
362
// Return if we don't have profiling information.
363
if (!PSI.hasInstrumentationProfile())
364
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
365
366
std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
367
std::make_unique<FunctionOutliningMultiRegionInfo>();
368
369
auto IsSingleExit =
370
[&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
371
BasicBlock *ExitBlock = nullptr;
372
for (auto *Block : BlockList) {
373
for (BasicBlock *Succ : successors(Block)) {
374
if (!is_contained(BlockList, Succ)) {
375
if (ExitBlock) {
376
ORE.emit([&]() {
377
return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
378
&Succ->front())
379
<< "Region dominated by "
380
<< ore::NV("Block", BlockList.front()->getName())
381
<< " has more than one region exit edge.";
382
});
383
return nullptr;
384
}
385
386
ExitBlock = Block;
387
}
388
}
389
}
390
return ExitBlock;
391
};
392
393
auto BBProfileCount = [BFI](BasicBlock *BB) {
394
return BFI->getBlockProfileCount(BB).value_or(0);
395
};
396
397
// Use the same computeBBInlineCost function to compute the cost savings of
398
// the outlining the candidate region.
399
TargetTransformInfo *FTTI = &GetTTI(F);
400
InstructionCost OverallFunctionCost = 0;
401
for (auto &BB : F)
402
OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
403
404
LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
405
<< "\n";);
406
407
InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
408
[&](auto Cost) { return Cost * MinRegionSizeRatio; });
409
410
BranchProbability MinBranchProbability(
411
static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
412
MinBlockCounterExecution);
413
bool ColdCandidateFound = false;
414
BasicBlock *CurrEntry = EntryBlock;
415
std::vector<BasicBlock *> DFS;
416
DenseMap<BasicBlock *, bool> VisitedMap;
417
DFS.push_back(CurrEntry);
418
VisitedMap[CurrEntry] = true;
419
420
// Use Depth First Search on the basic blocks to find CFG edges that are
421
// considered cold.
422
// Cold regions considered must also have its inline cost compared to the
423
// overall inline cost of the original function. The region is outlined only
424
// if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
425
// more.
426
while (!DFS.empty()) {
427
auto *ThisBB = DFS.back();
428
DFS.pop_back();
429
// Only consider regions with predecessor blocks that are considered
430
// not-cold (default: part of the top 99.99% of all block counters)
431
// AND greater than our minimum block execution count (default: 100).
432
if (PSI.isColdBlock(ThisBB, BFI) ||
433
BBProfileCount(ThisBB) < MinBlockCounterExecution)
434
continue;
435
for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
436
if (VisitedMap[*SI])
437
continue;
438
VisitedMap[*SI] = true;
439
DFS.push_back(*SI);
440
// If branch isn't cold, we skip to the next one.
441
BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
442
if (SuccProb > MinBranchProbability)
443
continue;
444
445
LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
446
<< SI->getName()
447
<< "\nBranch Probability = " << SuccProb << "\n";);
448
449
SmallVector<BasicBlock *, 8> DominateVector;
450
DT.getDescendants(*SI, DominateVector);
451
assert(!DominateVector.empty() &&
452
"SI should be reachable and have at least itself as descendant");
453
454
// We can only outline single entry regions (for now).
455
if (!DominateVector.front()->hasNPredecessors(1)) {
456
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
457
<< " doesn't have a single predecessor in the "
458
"dominator tree\n";);
459
continue;
460
}
461
462
BasicBlock *ExitBlock = nullptr;
463
// We can only outline single exit regions (for now).
464
if (!(ExitBlock = IsSingleExit(DominateVector))) {
465
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
466
<< " doesn't have a unique successor\n";);
467
continue;
468
}
469
470
InstructionCost OutlineRegionCost = 0;
471
for (auto *BB : DominateVector)
472
OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
473
474
LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
475
<< "\n";);
476
477
if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
478
ORE.emit([&]() {
479
return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
480
&SI->front())
481
<< ore::NV("Callee", &F)
482
<< " inline cost-savings smaller than "
483
<< ore::NV("Cost", MinOutlineRegionCost);
484
});
485
486
LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
487
<< MinOutlineRegionCost << "\n";);
488
continue;
489
}
490
491
// For now, ignore blocks that belong to a SISE region that is a
492
// candidate for outlining. In the future, we may want to look
493
// at inner regions because the outer region may have live-exit
494
// variables.
495
for (auto *BB : DominateVector)
496
VisitedMap[BB] = true;
497
498
// ReturnBlock here means the block after the outline call
499
BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
500
FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
501
DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
502
OutliningInfo->ORI.push_back(RegInfo);
503
LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
504
<< DominateVector.front()->getName() << "\n";);
505
ColdCandidateFound = true;
506
NumColdRegionsFound++;
507
}
508
}
509
510
if (ColdCandidateFound)
511
return OutliningInfo;
512
513
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
514
}
515
516
std::unique_ptr<FunctionOutliningInfo>
517
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
518
BasicBlock *EntryBlock = &F.front();
519
BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
520
if (!BR || BR->isUnconditional())
521
return std::unique_ptr<FunctionOutliningInfo>();
522
523
// Returns true if Succ is BB's successor
524
auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
525
return is_contained(successors(BB), Succ);
526
};
527
528
auto IsReturnBlock = [](BasicBlock *BB) {
529
Instruction *TI = BB->getTerminator();
530
return isa<ReturnInst>(TI);
531
};
532
533
auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
534
if (IsReturnBlock(Succ1))
535
return std::make_tuple(Succ1, Succ2);
536
if (IsReturnBlock(Succ2))
537
return std::make_tuple(Succ2, Succ1);
538
539
return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
540
};
541
542
// Detect a triangular shape:
543
auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
544
if (IsSuccessor(Succ1, Succ2))
545
return std::make_tuple(Succ1, Succ2);
546
if (IsSuccessor(Succ2, Succ1))
547
return std::make_tuple(Succ2, Succ1);
548
549
return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
550
};
551
552
std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
553
std::make_unique<FunctionOutliningInfo>();
554
555
BasicBlock *CurrEntry = EntryBlock;
556
bool CandidateFound = false;
557
do {
558
// The number of blocks to be inlined has already reached
559
// the limit. When MaxNumInlineBlocks is set to 0 or 1, this
560
// disables partial inlining for the function.
561
if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
562
break;
563
564
if (succ_size(CurrEntry) != 2)
565
break;
566
567
BasicBlock *Succ1 = *succ_begin(CurrEntry);
568
BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
569
570
BasicBlock *ReturnBlock, *NonReturnBlock;
571
std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
572
573
if (ReturnBlock) {
574
OutliningInfo->Entries.push_back(CurrEntry);
575
OutliningInfo->ReturnBlock = ReturnBlock;
576
OutliningInfo->NonReturnBlock = NonReturnBlock;
577
CandidateFound = true;
578
break;
579
}
580
581
BasicBlock *CommSucc, *OtherSucc;
582
std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
583
584
if (!CommSucc)
585
break;
586
587
OutliningInfo->Entries.push_back(CurrEntry);
588
CurrEntry = OtherSucc;
589
} while (true);
590
591
if (!CandidateFound)
592
return std::unique_ptr<FunctionOutliningInfo>();
593
594
// There should not be any successors (not in the entry set) other than
595
// {ReturnBlock, NonReturnBlock}
596
assert(OutliningInfo->Entries[0] == &F.front() &&
597
"Function Entry must be the first in Entries vector");
598
DenseSet<BasicBlock *> Entries;
599
for (BasicBlock *E : OutliningInfo->Entries)
600
Entries.insert(E);
601
602
// Returns true of BB has Predecessor which is not
603
// in Entries set.
604
auto HasNonEntryPred = [Entries](BasicBlock *BB) {
605
for (auto *Pred : predecessors(BB)) {
606
if (!Entries.count(Pred))
607
return true;
608
}
609
return false;
610
};
611
auto CheckAndNormalizeCandidate =
612
[Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
613
for (BasicBlock *E : OutliningInfo->Entries) {
614
for (auto *Succ : successors(E)) {
615
if (Entries.count(Succ))
616
continue;
617
if (Succ == OutliningInfo->ReturnBlock)
618
OutliningInfo->ReturnBlockPreds.push_back(E);
619
else if (Succ != OutliningInfo->NonReturnBlock)
620
return false;
621
}
622
// There should not be any outside incoming edges either:
623
if (HasNonEntryPred(E))
624
return false;
625
}
626
return true;
627
};
628
629
if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
630
return std::unique_ptr<FunctionOutliningInfo>();
631
632
// Now further growing the candidate's inlining region by
633
// peeling off dominating blocks from the outlining region:
634
while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
635
BasicBlock *Cand = OutliningInfo->NonReturnBlock;
636
if (succ_size(Cand) != 2)
637
break;
638
639
if (HasNonEntryPred(Cand))
640
break;
641
642
BasicBlock *Succ1 = *succ_begin(Cand);
643
BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
644
645
BasicBlock *ReturnBlock, *NonReturnBlock;
646
std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
647
if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
648
break;
649
650
if (NonReturnBlock->getSinglePredecessor() != Cand)
651
break;
652
653
// Now grow and update OutlininigInfo:
654
OutliningInfo->Entries.push_back(Cand);
655
OutliningInfo->NonReturnBlock = NonReturnBlock;
656
OutliningInfo->ReturnBlockPreds.push_back(Cand);
657
Entries.insert(Cand);
658
}
659
660
return OutliningInfo;
661
}
662
663
// Check if there is PGO data or user annotated branch data:
664
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
665
if (F.hasProfileData())
666
return true;
667
// Now check if any of the entry block has MD_prof data:
668
for (auto *E : OI.Entries) {
669
BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
670
if (!BR || BR->isUnconditional())
671
continue;
672
if (hasBranchWeightMD(*BR))
673
return true;
674
}
675
return false;
676
}
677
678
BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
679
FunctionCloner &Cloner) const {
680
BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
681
auto EntryFreq =
682
Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
683
auto OutliningCallFreq =
684
Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
685
// FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
686
// we outlined any regions, so we may encounter situations where the
687
// OutliningCallFreq is *slightly* bigger than the EntryFreq.
688
if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
689
OutliningCallFreq = EntryFreq;
690
691
auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
692
OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
693
694
if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
695
return OutlineRegionRelFreq;
696
697
// When profile data is not available, we need to be conservative in
698
// estimating the overall savings. Static branch prediction can usually
699
// guess the branch direction right (taken/non-taken), but the guessed
700
// branch probability is usually not biased enough. In case when the
701
// outlined region is predicted to be likely, its probability needs
702
// to be made higher (more biased) to not under-estimate the cost of
703
// function outlining. On the other hand, if the outlined region
704
// is predicted to be less likely, the predicted probablity is usually
705
// higher than the actual. For instance, the actual probability of the
706
// less likely target is only 5%, but the guessed probablity can be
707
// 40%. In the latter case, there is no need for further adjustment.
708
// FIXME: add an option for this.
709
if (OutlineRegionRelFreq < BranchProbability(45, 100))
710
return OutlineRegionRelFreq;
711
712
OutlineRegionRelFreq = std::max(
713
OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
714
715
return OutlineRegionRelFreq;
716
}
717
718
bool PartialInlinerImpl::shouldPartialInline(
719
CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
720
OptimizationRemarkEmitter &ORE) const {
721
using namespace ore;
722
723
Function *Callee = CB.getCalledFunction();
724
assert(Callee == Cloner.ClonedFunc);
725
726
if (SkipCostAnalysis)
727
return isInlineViable(*Callee).isSuccess();
728
729
Function *Caller = CB.getCaller();
730
auto &CalleeTTI = GetTTI(*Callee);
731
bool RemarksEnabled =
732
Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
733
DEBUG_TYPE);
734
InlineCost IC =
735
getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
736
GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
737
738
if (IC.isAlways()) {
739
ORE.emit([&]() {
740
return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
741
<< NV("Callee", Cloner.OrigFunc)
742
<< " should always be fully inlined, not partially";
743
});
744
return false;
745
}
746
747
if (IC.isNever()) {
748
ORE.emit([&]() {
749
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
750
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
751
<< NV("Caller", Caller)
752
<< " because it should never be inlined (cost=never)";
753
});
754
return false;
755
}
756
757
if (!IC) {
758
ORE.emit([&]() {
759
return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
760
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
761
<< NV("Caller", Caller) << " because too costly to inline (cost="
762
<< NV("Cost", IC.getCost()) << ", threshold="
763
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
764
});
765
return false;
766
}
767
const DataLayout &DL = Caller->getDataLayout();
768
769
// The savings of eliminating the call:
770
int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
771
BlockFrequency NormWeightedSavings(NonWeightedSavings);
772
773
// Weighted saving is smaller than weighted cost, return false
774
if (NormWeightedSavings < WeightedOutliningRcost) {
775
ORE.emit([&]() {
776
return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
777
&CB)
778
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
779
<< NV("Caller", Caller) << " runtime overhead (overhead="
780
<< NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
781
<< ", savings="
782
<< NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
783
<< ")"
784
<< " of making the outlined call is too high";
785
});
786
787
return false;
788
}
789
790
ORE.emit([&]() {
791
return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
792
<< NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
793
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
794
<< " (threshold="
795
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
796
});
797
return true;
798
}
799
800
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
801
// For now use a simplified version. The returned 'InlineCost' will be used
802
// to esimate the size cost as well as runtime cost of the BB.
803
InstructionCost
804
PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
805
TargetTransformInfo *TTI) {
806
InstructionCost InlineCost = 0;
807
const DataLayout &DL = BB->getDataLayout();
808
int InstrCost = InlineConstants::getInstrCost();
809
for (Instruction &I : BB->instructionsWithoutDebug()) {
810
// Skip free instructions.
811
switch (I.getOpcode()) {
812
case Instruction::BitCast:
813
case Instruction::PtrToInt:
814
case Instruction::IntToPtr:
815
case Instruction::Alloca:
816
case Instruction::PHI:
817
continue;
818
case Instruction::GetElementPtr:
819
if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
820
continue;
821
break;
822
default:
823
break;
824
}
825
826
if (I.isLifetimeStartOrEnd())
827
continue;
828
829
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
830
Intrinsic::ID IID = II->getIntrinsicID();
831
SmallVector<Type *, 4> Tys;
832
FastMathFlags FMF;
833
for (Value *Val : II->args())
834
Tys.push_back(Val->getType());
835
836
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
837
FMF = FPMO->getFastMathFlags();
838
839
IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
840
InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
841
continue;
842
}
843
844
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
845
InlineCost += getCallsiteCost(*TTI, *CI, DL);
846
continue;
847
}
848
849
if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
850
InlineCost += getCallsiteCost(*TTI, *II, DL);
851
continue;
852
}
853
854
if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
855
InlineCost += (SI->getNumCases() + 1) * InstrCost;
856
continue;
857
}
858
InlineCost += InstrCost;
859
}
860
861
return InlineCost;
862
}
863
864
std::tuple<InstructionCost, InstructionCost>
865
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
866
InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
867
for (auto FuncBBPair : Cloner.OutlinedFunctions) {
868
Function *OutlinedFunc = FuncBBPair.first;
869
BasicBlock* OutliningCallBB = FuncBBPair.second;
870
// Now compute the cost of the call sequence to the outlined function
871
// 'OutlinedFunction' in BB 'OutliningCallBB':
872
auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
873
OutliningFuncCallCost +=
874
computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
875
876
// Now compute the cost of the extracted/outlined function itself:
877
for (BasicBlock &BB : *OutlinedFunc)
878
OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
879
}
880
assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
881
"Outlined function cost should be no less than the outlined region");
882
883
// The code extractor introduces a new root and exit stub blocks with
884
// additional unconditional branches. Those branches will be eliminated
885
// later with bb layout. The cost should be adjusted accordingly:
886
OutlinedFunctionCost -=
887
2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
888
889
InstructionCost OutliningRuntimeOverhead =
890
OutliningFuncCallCost +
891
(OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
892
ExtraOutliningPenalty.getValue();
893
894
return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
895
}
896
897
// Create the callsite to profile count map which is
898
// used to update the original function's entry count,
899
// after the function is partially inlined into the callsite.
900
void PartialInlinerImpl::computeCallsiteToProfCountMap(
901
Function *DuplicateFunction,
902
DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
903
std::vector<User *> Users(DuplicateFunction->user_begin(),
904
DuplicateFunction->user_end());
905
Function *CurrentCaller = nullptr;
906
std::unique_ptr<BlockFrequencyInfo> TempBFI;
907
BlockFrequencyInfo *CurrentCallerBFI = nullptr;
908
909
auto ComputeCurrBFI = [&,this](Function *Caller) {
910
// For the old pass manager:
911
if (!GetBFI) {
912
DominatorTree DT(*Caller);
913
LoopInfo LI(DT);
914
BranchProbabilityInfo BPI(*Caller, LI);
915
TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
916
CurrentCallerBFI = TempBFI.get();
917
} else {
918
// New pass manager:
919
CurrentCallerBFI = &(GetBFI(*Caller));
920
}
921
};
922
923
for (User *User : Users) {
924
// Don't bother with BlockAddress used by CallBr for asm goto.
925
if (isa<BlockAddress>(User))
926
continue;
927
CallBase *CB = getSupportedCallBase(User);
928
Function *Caller = CB->getCaller();
929
if (CurrentCaller != Caller) {
930
CurrentCaller = Caller;
931
ComputeCurrBFI(Caller);
932
} else {
933
assert(CurrentCallerBFI && "CallerBFI is not set");
934
}
935
BasicBlock *CallBB = CB->getParent();
936
auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
937
if (Count)
938
CallSiteToProfCountMap[User] = *Count;
939
else
940
CallSiteToProfCountMap[User] = 0;
941
}
942
}
943
944
PartialInlinerImpl::FunctionCloner::FunctionCloner(
945
Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
946
function_ref<AssumptionCache *(Function &)> LookupAC,
947
function_ref<TargetTransformInfo &(Function &)> GetTTI)
948
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
949
ClonedOI = std::make_unique<FunctionOutliningInfo>();
950
951
// Clone the function, so that we can hack away on it.
952
ValueToValueMapTy VMap;
953
ClonedFunc = CloneFunction(F, VMap);
954
955
ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
956
ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
957
for (BasicBlock *BB : OI->Entries)
958
ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
959
960
for (BasicBlock *E : OI->ReturnBlockPreds) {
961
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
962
ClonedOI->ReturnBlockPreds.push_back(NewE);
963
}
964
// Go ahead and update all uses to the duplicate, so that we can just
965
// use the inliner functionality when we're done hacking.
966
F->replaceAllUsesWith(ClonedFunc);
967
}
968
969
PartialInlinerImpl::FunctionCloner::FunctionCloner(
970
Function *F, FunctionOutliningMultiRegionInfo *OI,
971
OptimizationRemarkEmitter &ORE,
972
function_ref<AssumptionCache *(Function &)> LookupAC,
973
function_ref<TargetTransformInfo &(Function &)> GetTTI)
974
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
975
ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
976
977
// Clone the function, so that we can hack away on it.
978
ValueToValueMapTy VMap;
979
ClonedFunc = CloneFunction(F, VMap);
980
981
// Go through all Outline Candidate Regions and update all BasicBlock
982
// information.
983
for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
984
OI->ORI) {
985
SmallVector<BasicBlock *, 8> Region;
986
for (BasicBlock *BB : RegionInfo.Region)
987
Region.push_back(cast<BasicBlock>(VMap[BB]));
988
989
BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
990
BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
991
BasicBlock *NewReturnBlock = nullptr;
992
if (RegionInfo.ReturnBlock)
993
NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
994
FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
995
Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
996
ClonedOMRI->ORI.push_back(MappedRegionInfo);
997
}
998
// Go ahead and update all uses to the duplicate, so that we can just
999
// use the inliner functionality when we're done hacking.
1000
F->replaceAllUsesWith(ClonedFunc);
1001
}
1002
1003
void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1004
auto GetFirstPHI = [](BasicBlock *BB) {
1005
BasicBlock::iterator I = BB->begin();
1006
PHINode *FirstPhi = nullptr;
1007
while (I != BB->end()) {
1008
PHINode *Phi = dyn_cast<PHINode>(I);
1009
if (!Phi)
1010
break;
1011
if (!FirstPhi) {
1012
FirstPhi = Phi;
1013
break;
1014
}
1015
}
1016
return FirstPhi;
1017
};
1018
1019
// Shouldn't need to normalize PHIs if we're not outlining non-early return
1020
// blocks.
1021
if (!ClonedOI)
1022
return;
1023
1024
// Special hackery is needed with PHI nodes that have inputs from more than
1025
// one extracted block. For simplicity, just split the PHIs into a two-level
1026
// sequence of PHIs, some of which will go in the extracted region, and some
1027
// of which will go outside.
1028
BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1029
// only split block when necessary:
1030
PHINode *FirstPhi = GetFirstPHI(PreReturn);
1031
unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1032
1033
if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1034
return;
1035
1036
auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1037
if (llvm::all_equal(PN->incoming_values()))
1038
return PN->getIncomingValue(0);
1039
return nullptr;
1040
};
1041
1042
ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1043
ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1044
BasicBlock::iterator I = PreReturn->begin();
1045
BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
1046
SmallVector<Instruction *, 4> DeadPhis;
1047
while (I != PreReturn->end()) {
1048
PHINode *OldPhi = dyn_cast<PHINode>(I);
1049
if (!OldPhi)
1050
break;
1051
1052
PHINode *RetPhi =
1053
PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
1054
RetPhi->insertBefore(Ins);
1055
OldPhi->replaceAllUsesWith(RetPhi);
1056
Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1057
1058
RetPhi->addIncoming(&*I, PreReturn);
1059
for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1060
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1061
OldPhi->removeIncomingValue(E);
1062
}
1063
1064
// After incoming values splitting, the old phi may become trivial.
1065
// Keeping the trivial phi can introduce definition inside the outline
1066
// region which is live-out, causing necessary overhead (load, store
1067
// arg passing etc).
1068
if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1069
OldPhi->replaceAllUsesWith(OldPhiVal);
1070
DeadPhis.push_back(OldPhi);
1071
}
1072
++I;
1073
}
1074
for (auto *DP : DeadPhis)
1075
DP->eraseFromParent();
1076
1077
for (auto *E : ClonedOI->ReturnBlockPreds)
1078
E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1079
}
1080
1081
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1082
1083
auto ComputeRegionCost =
1084
[&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1085
InstructionCost Cost = 0;
1086
for (BasicBlock* BB : Region)
1087
Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1088
return Cost;
1089
};
1090
1091
assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1092
1093
if (ClonedOMRI->ORI.empty())
1094
return false;
1095
1096
// The CodeExtractor needs a dominator tree.
1097
DominatorTree DT;
1098
DT.recalculate(*ClonedFunc);
1099
1100
// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1101
LoopInfo LI(DT);
1102
BranchProbabilityInfo BPI(*ClonedFunc, LI);
1103
ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1104
1105
// Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1106
CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1107
1108
SetVector<Value *> Inputs, Outputs, Sinks;
1109
for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1110
ClonedOMRI->ORI) {
1111
InstructionCost CurrentOutlinedRegionCost =
1112
ComputeRegionCost(RegionInfo.Region);
1113
1114
CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1115
ClonedFuncBFI.get(), &BPI,
1116
LookupAC(*RegionInfo.EntryBlock->getParent()),
1117
/* AllowVarargs */ false);
1118
1119
CE.findInputsOutputs(Inputs, Outputs, Sinks);
1120
1121
LLVM_DEBUG({
1122
dbgs() << "inputs: " << Inputs.size() << "\n";
1123
dbgs() << "outputs: " << Outputs.size() << "\n";
1124
for (Value *value : Inputs)
1125
dbgs() << "value used in func: " << *value << "\n";
1126
for (Value *output : Outputs)
1127
dbgs() << "instr used in func: " << *output << "\n";
1128
});
1129
1130
// Do not extract regions that have live exit variables.
1131
if (Outputs.size() > 0 && !ForceLiveExit)
1132
continue;
1133
1134
if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1135
CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1136
BasicBlock *OutliningCallBB = OCS->getParent();
1137
assert(OutliningCallBB->getParent() == ClonedFunc);
1138
OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1139
NumColdRegionsOutlined++;
1140
OutlinedRegionCost += CurrentOutlinedRegionCost;
1141
1142
if (MarkOutlinedColdCC) {
1143
OutlinedFunc->setCallingConv(CallingConv::Cold);
1144
OCS->setCallingConv(CallingConv::Cold);
1145
}
1146
} else
1147
ORE.emit([&]() {
1148
return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1149
&RegionInfo.Region.front()->front())
1150
<< "Failed to extract region at block "
1151
<< ore::NV("Block", RegionInfo.Region.front());
1152
});
1153
}
1154
1155
return !OutlinedFunctions.empty();
1156
}
1157
1158
Function *
1159
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1160
// Returns true if the block is to be partial inlined into the caller
1161
// (i.e. not to be extracted to the out of line function)
1162
auto ToBeInlined = [&, this](BasicBlock *BB) {
1163
return BB == ClonedOI->ReturnBlock ||
1164
llvm::is_contained(ClonedOI->Entries, BB);
1165
};
1166
1167
assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1168
// The CodeExtractor needs a dominator tree.
1169
DominatorTree DT;
1170
DT.recalculate(*ClonedFunc);
1171
1172
// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1173
LoopInfo LI(DT);
1174
BranchProbabilityInfo BPI(*ClonedFunc, LI);
1175
ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1176
1177
// Gather up the blocks that we're going to extract.
1178
std::vector<BasicBlock *> ToExtract;
1179
auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1180
ToExtract.push_back(ClonedOI->NonReturnBlock);
1181
OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1182
ClonedOI->NonReturnBlock, ClonedFuncTTI);
1183
for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
1184
if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1185
ToExtract.push_back(BB);
1186
// FIXME: the code extractor may hoist/sink more code
1187
// into the outlined function which may make the outlining
1188
// overhead (the difference of the outlined function cost
1189
// and OutliningRegionCost) look larger.
1190
OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1191
}
1192
1193
// Extract the body of the if.
1194
CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1195
Function *OutlinedFunc =
1196
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1197
ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1198
/* AllowVarargs */ true)
1199
.extractCodeRegion(CEAC);
1200
1201
if (OutlinedFunc) {
1202
BasicBlock *OutliningCallBB =
1203
PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1204
assert(OutliningCallBB->getParent() == ClonedFunc);
1205
OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1206
} else
1207
ORE.emit([&]() {
1208
return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1209
&ToExtract.front()->front())
1210
<< "Failed to extract region at block "
1211
<< ore::NV("Block", ToExtract.front());
1212
});
1213
1214
return OutlinedFunc;
1215
}
1216
1217
PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1218
// Ditch the duplicate, since we're done with it, and rewrite all remaining
1219
// users (function pointers, etc.) back to the original function.
1220
ClonedFunc->replaceAllUsesWith(OrigFunc);
1221
ClonedFunc->eraseFromParent();
1222
if (!IsFunctionInlined) {
1223
// Remove each function that was speculatively created if there is no
1224
// reference.
1225
for (auto FuncBBPair : OutlinedFunctions) {
1226
Function *Func = FuncBBPair.first;
1227
Func->eraseFromParent();
1228
}
1229
}
1230
}
1231
1232
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1233
if (F.hasAddressTaken())
1234
return {false, nullptr};
1235
1236
// Let inliner handle it
1237
if (F.hasFnAttribute(Attribute::AlwaysInline))
1238
return {false, nullptr};
1239
1240
if (F.hasFnAttribute(Attribute::NoInline))
1241
return {false, nullptr};
1242
1243
if (PSI.isFunctionEntryCold(&F))
1244
return {false, nullptr};
1245
1246
if (F.users().empty())
1247
return {false, nullptr};
1248
1249
OptimizationRemarkEmitter ORE(&F);
1250
1251
// Only try to outline cold regions if we have a profile summary, which
1252
// implies we have profiling information.
1253
if (PSI.hasProfileSummary() && F.hasProfileData() &&
1254
!DisableMultiRegionPartialInline) {
1255
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1256
computeOutliningColdRegionsInfo(F, ORE);
1257
if (OMRI) {
1258
FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1259
1260
LLVM_DEBUG({
1261
dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1262
dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1263
<< "\n";
1264
});
1265
1266
bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1267
1268
if (DidOutline) {
1269
LLVM_DEBUG({
1270
dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1271
Cloner.ClonedFunc->print(dbgs());
1272
dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1273
});
1274
1275
if (tryPartialInline(Cloner))
1276
return {true, nullptr};
1277
}
1278
}
1279
}
1280
1281
// Fall-thru to regular partial inlining if we:
1282
// i) can't find any cold regions to outline, or
1283
// ii) can't inline the outlined function anywhere.
1284
std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1285
if (!OI)
1286
return {false, nullptr};
1287
1288
FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1289
Cloner.normalizeReturnBlock();
1290
1291
Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1292
1293
if (!OutlinedFunction)
1294
return {false, nullptr};
1295
1296
if (tryPartialInline(Cloner))
1297
return {true, OutlinedFunction};
1298
1299
return {false, nullptr};
1300
}
1301
1302
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1303
if (Cloner.OutlinedFunctions.empty())
1304
return false;
1305
1306
auto OutliningCosts = computeOutliningCosts(Cloner);
1307
1308
InstructionCost SizeCost = std::get<0>(OutliningCosts);
1309
InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);
1310
1311
assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
1312
"Expected valid costs");
1313
1314
// Only calculate RelativeToEntryFreq when we are doing single region
1315
// outlining.
1316
BranchProbability RelativeToEntryFreq;
1317
if (Cloner.ClonedOI)
1318
RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1319
else
1320
// RelativeToEntryFreq doesn't make sense when we have more than one
1321
// outlined call because each call will have a different relative frequency
1322
// to the entry block. We can consider using the average, but the
1323
// usefulness of that information is questionable. For now, assume we never
1324
// execute the calls to outlined functions.
1325
RelativeToEntryFreq = BranchProbability(0, 1);
1326
1327
BlockFrequency WeightedRcost =
1328
BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;
1329
1330
// The call sequence(s) to the outlined function(s) are larger than the sum of
1331
// the original outlined region size(s), it does not increase the chances of
1332
// inlining the function with outlining (The inliner uses the size increase to
1333
// model the cost of inlining a callee).
1334
if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1335
OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1336
DebugLoc DLoc;
1337
BasicBlock *Block;
1338
std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1339
OrigFuncORE.emit([&]() {
1340
return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1341
DLoc, Block)
1342
<< ore::NV("Function", Cloner.OrigFunc)
1343
<< " not partially inlined into callers (Original Size = "
1344
<< ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1345
<< ", Size of call sequence to outlined function = "
1346
<< ore::NV("NewSize", SizeCost) << ")";
1347
});
1348
return false;
1349
}
1350
1351
assert(Cloner.OrigFunc->users().empty() &&
1352
"F's users should all be replaced!");
1353
1354
std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1355
Cloner.ClonedFunc->user_end());
1356
1357
DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1358
auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1359
if (CalleeEntryCount)
1360
computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1361
1362
uint64_t CalleeEntryCountV =
1363
(CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1364
1365
bool AnyInline = false;
1366
for (User *User : Users) {
1367
// Don't bother with BlockAddress used by CallBr for asm goto.
1368
if (isa<BlockAddress>(User))
1369
continue;
1370
1371
CallBase *CB = getSupportedCallBase(User);
1372
1373
if (isLimitReached())
1374
continue;
1375
1376
OptimizationRemarkEmitter CallerORE(CB->getCaller());
1377
if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1378
continue;
1379
1380
// Construct remark before doing the inlining, as after successful inlining
1381
// the callsite is removed.
1382
OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1383
OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1384
<< ore::NV("Caller", CB->getCaller());
1385
1386
InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
1387
// We can only forward varargs when we outlined a single region, else we
1388
// bail on vararg functions.
1389
if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1390
(Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1391
: nullptr))
1392
.isSuccess())
1393
continue;
1394
1395
CallerORE.emit(OR);
1396
1397
// Now update the entry count:
1398
if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1399
uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1400
CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1401
}
1402
1403
AnyInline = true;
1404
NumPartialInlining++;
1405
// Update the stats
1406
if (Cloner.ClonedOI)
1407
NumPartialInlined++;
1408
else
1409
NumColdOutlinePartialInlined++;
1410
}
1411
1412
if (AnyInline) {
1413
Cloner.IsFunctionInlined = true;
1414
if (CalleeEntryCount)
1415
Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1416
CalleeEntryCountV, CalleeEntryCount->getType()));
1417
OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1418
OrigFuncORE.emit([&]() {
1419
return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1420
<< "Partially inlined into at least one caller";
1421
});
1422
}
1423
1424
return AnyInline;
1425
}
1426
1427
bool PartialInlinerImpl::run(Module &M) {
1428
if (DisablePartialInlining)
1429
return false;
1430
1431
std::vector<Function *> Worklist;
1432
Worklist.reserve(M.size());
1433
for (Function &F : M)
1434
if (!F.use_empty() && !F.isDeclaration())
1435
Worklist.push_back(&F);
1436
1437
bool Changed = false;
1438
while (!Worklist.empty()) {
1439
Function *CurrFunc = Worklist.back();
1440
Worklist.pop_back();
1441
1442
if (CurrFunc->use_empty())
1443
continue;
1444
1445
std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1446
if (Result.second)
1447
Worklist.push_back(Result.second);
1448
Changed |= Result.first;
1449
}
1450
1451
return Changed;
1452
}
1453
1454
PreservedAnalyses PartialInlinerPass::run(Module &M,
1455
ModuleAnalysisManager &AM) {
1456
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1457
1458
auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1459
return FAM.getResult<AssumptionAnalysis>(F);
1460
};
1461
1462
auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1463
return FAM.getCachedResult<AssumptionAnalysis>(F);
1464
};
1465
1466
auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1467
return FAM.getResult<BlockFrequencyAnalysis>(F);
1468
};
1469
1470
auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1471
return FAM.getResult<TargetIRAnalysis>(F);
1472
};
1473
1474
auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1475
return FAM.getResult<TargetLibraryAnalysis>(F);
1476
};
1477
1478
ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1479
1480
if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1481
GetTLI, PSI, GetBFI)
1482
.run(M))
1483
return PreservedAnalyses::none();
1484
return PreservedAnalyses::all();
1485
}
1486
1487