//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;
51
52
#define DEBUG_TYPE "inline-cost"
53
54
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56
static cl::opt<int>
57
DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58
cl::desc("Default amount of inlining to perform"));
59
60
// We introduce this option since there is a minor compile-time win by avoiding
61
// addition of TTI attributes (target-features in particular) to inline
62
// candidates when they are guaranteed to be the same as top level methods in
63
// some use cases. If we avoid adding the attribute, we need an option to avoid
64
// checking these attributes.
65
static cl::opt<bool> IgnoreTTIInlineCompatible(
66
"ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67
cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68
"during inline cost calculation"));
69
70
static cl::opt<bool> PrintInstructionComments(
71
"print-instruction-comments", cl::Hidden, cl::init(false),
72
cl::desc("Prints comments for instruction based on inline cost analysis"));
73
74
static cl::opt<int> InlineThreshold(
75
"inline-threshold", cl::Hidden, cl::init(225),
76
cl::desc("Control the amount of inlining to perform (default = 225)"));
77
78
static cl::opt<int> HintThreshold(
79
"inlinehint-threshold", cl::Hidden, cl::init(325),
80
cl::desc("Threshold for inlining functions with inline hint"));
81
82
static cl::opt<int>
83
ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84
cl::init(45),
85
cl::desc("Threshold for inlining cold callsites"));
86
87
static cl::opt<bool> InlineEnableCostBenefitAnalysis(
88
"inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89
cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91
// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92
// specified explicitly in command line options. This option is exposed
93
// for tuning and testing.
94
static cl::opt<int> InlineSavingsMultiplier(
95
"inline-savings-multiplier", cl::Hidden, cl::init(8),
96
cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98
// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99
// specified explicitly in command line options. This option is exposed
100
// for tuning and testing.
101
static cl::opt<int> InlineSavingsProfitableMultiplier(
102
"inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103
cl::desc("A multiplier on top of cycle savings to decide whether the "
104
"savings won't justify the cost"));
105
106
static cl::opt<int>
107
InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108
cl::desc("The maximum size of a callee that get's "
109
"inlined without sufficient cycle savings"));
110
111
// We introduce this threshold to help performance of instrumentation based
112
// PGO before we actually hook up inliner with analysis passes such as BPI and
113
// BFI.
114
static cl::opt<int> ColdThreshold(
115
"inlinecold-threshold", cl::Hidden, cl::init(45),
116
cl::desc("Threshold for inlining functions with cold attribute"));
117
118
static cl::opt<int>
119
HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
120
cl::desc("Threshold for hot callsites "));
121
122
static cl::opt<int> LocallyHotCallSiteThreshold(
123
"locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124
cl::desc("Threshold for locally hot callsites "));
125
126
static cl::opt<int> ColdCallSiteRelFreq(
127
"cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128
cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129
"entry frequency, for a callsite to be cold in the absence of "
130
"profile information."));
131
132
static cl::opt<uint64_t> HotCallSiteRelFreq(
133
"hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134
cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135
"entry frequency, for a callsite to be hot in the absence of "
136
"profile information."));
137
138
static cl::opt<int>
139
InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
140
cl::desc("Cost of a single instruction when inlining"));
141
142
static cl::opt<int>
143
MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
144
cl::desc("Cost of load/store instruction when inlining"));
145
146
static cl::opt<int> CallPenalty(
147
"inline-call-penalty", cl::Hidden, cl::init(25),
148
cl::desc("Call penalty that is applied per callsite when inlining"));
149
150
static cl::opt<size_t>
151
StackSizeThreshold("inline-max-stacksize", cl::Hidden,
152
cl::init(std::numeric_limits<size_t>::max()),
153
cl::desc("Do not inline functions with a stack size "
154
"that exceeds the specified limit"));
155
156
static cl::opt<size_t> RecurStackSizeThreshold(
157
"recursive-inline-max-stacksize", cl::Hidden,
158
cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
159
cl::desc("Do not inline recursive functions with a stack "
160
"size that exceeds the specified limit"));
161
162
static cl::opt<bool> OptComputeFullInlineCost(
163
"inline-cost-full", cl::Hidden,
164
cl::desc("Compute the full inline cost of a call site even when the cost "
165
"exceeds the threshold."));
166
167
static cl::opt<bool> InlineCallerSupersetNoBuiltin(
168
"inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169
cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170
"attributes."));
171
172
static cl::opt<bool> DisableGEPConstOperand(
173
"disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174
cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176
namespace llvm {
177
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178
if (Attr.isValid()) {
179
int AttrValue = 0;
180
if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181
return AttrValue;
182
}
183
return std::nullopt;
184
}
185
186
std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187
return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188
}
189
190
std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191
return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192
}
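
// For example, if a call site (or its callee) carries the string attribute
// "function-inline-cost"="15", getStringFnAttrAsInt(CB, "function-inline-cost")
// returns 15; a missing or non-integer value yields std::nullopt.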
193
194
namespace InlineConstants {
195
int getInstrCost() { return InstrCost; }
196
197
} // namespace InlineConstants
198
199
} // namespace llvm
200
201
namespace {
202
class InlineCostCallAnalyzer;
203
204
// This struct is used to store information about inline cost of a
205
// particular instruction
206
struct InstructionCostDetail {
207
int CostBefore = 0;
208
int CostAfter = 0;
209
int ThresholdBefore = 0;
210
int ThresholdAfter = 0;
211
212
int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
213
214
int getCostDelta() const { return CostAfter - CostBefore; }
215
216
bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
217
};
218
219
class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
220
private:
221
InlineCostCallAnalyzer *const ICCA;
222
223
public:
224
InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
225
void emitInstructionAnnot(const Instruction *I,
226
formatted_raw_ostream &OS) override;
227
};
228
229
/// Carry out call site analysis, in order to evaluate inlinability.
230
/// NOTE: the type is currently used as an implementation detail of functions
/// such
231
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232
/// expectation is that they come from the outer scope, from the wrapper
233
/// functions. If we want to support constructing CallAnalyzer objects where
234
/// lambdas are provided inline at construction, or where the object needs to
235
/// otherwise survive past the scope of the provided functions, we need to
236
/// revisit the argument types.
237
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
238
typedef InstVisitor<CallAnalyzer, bool> Base;
239
friend class InstVisitor<CallAnalyzer, bool>;
240
241
protected:
242
virtual ~CallAnalyzer() = default;
243
/// The TargetTransformInfo available for this compilation.
244
const TargetTransformInfo &TTI;
245
246
/// Getter for the cache of @llvm.assume intrinsics.
247
function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
248
249
/// Getter for BlockFrequencyInfo
250
function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
251
252
/// Profile summary information.
253
ProfileSummaryInfo *PSI;
254
255
/// The called function.
256
Function &F;
257
258
// Cache the DataLayout since we use it a lot.
259
const DataLayout &DL;
260
261
/// The OptimizationRemarkEmitter available for this compilation.
262
OptimizationRemarkEmitter *ORE;
263
264
/// The candidate callsite being analyzed. Please do not use this to do
265
/// analysis in the caller function; we want the inline cost query to be
266
/// easily cacheable. Instead, use the cover function paramHasAttr.
267
CallBase &CandidateCall;
268
269
/// Extension points for handling callsite features.
270
/// Called before a basic block is analyzed.
271
virtual void onBlockStart(const BasicBlock *BB) {}
272
273
/// Called after a basic block was analyzed.
274
virtual void onBlockAnalyzed(const BasicBlock *BB) {}
275
276
/// Called before an instruction was analyzed
277
virtual void onInstructionAnalysisStart(const Instruction *I) {}
278
279
/// Called after an instruction was analyzed
280
virtual void onInstructionAnalysisFinish(const Instruction *I) {}
281
282
/// Called at the end of the analysis of the callsite. Return the outcome of
283
/// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
284
/// the reason it can't.
285
virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
286
/// Called when we're about to start processing a basic block, and every time
/// we are done processing an instruction. Return true if there is no point in
/// continuing the analysis (e.g. we've already determined the call site is
/// too expensive to inline).
290
virtual bool shouldStop() { return false; }
291
292
/// Called before the analysis of the callee body starts (with callsite
293
/// contexts propagated). It checks callsite-specific information. Return a
294
/// reason analysis can't continue if that's the case, or 'true' if it may
295
/// continue.
296
virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
297
/// Called if the analysis engine decides SROA cannot be done for the given
298
/// alloca.
299
virtual void onDisableSROA(AllocaInst *Arg) {}
300
301
/// Called when the analysis engine determines load elimination won't happen.
302
virtual void onDisableLoadElimination() {}
303
304
/// Called when we visit a CallBase, before the analysis starts. Return false
305
/// to stop further processing of the instruction.
306
virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
307
308
/// Called to account for a call.
309
virtual void onCallPenalty() {}
310
311
/// Called to account for a load or store.
312
virtual void onMemAccess() {}
313
314
/// Called to account for the expectation the inlining would result in a load
315
/// elimination.
316
virtual void onLoadEliminationOpportunity() {}
317
318
/// Called to account for the cost of argument setup for the Call in the
319
/// callee's body (not the callsite currently under analysis).
320
virtual void onCallArgumentSetup(const CallBase &Call) {}
321
322
/// Called to account for a load relative intrinsic.
323
virtual void onLoadRelativeIntrinsic() {}
324
325
/// Called to account for a lowered call.
326
virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
327
}
328
329
/// Account for a jump table of given size. Return false to stop further
/// processing of the switch instruction.
331
virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
332
333
/// Account for a case cluster of given size. Return false to stop further
334
/// processing of the instruction.
335
virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
336
337
/// Called at the end of processing a switch instruction, with the given
338
/// number of case clusters.
339
virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
340
bool DefaultDestUndefined) {}
341
342
/// Called to account for any other instruction not specifically accounted
343
/// for.
344
virtual void onMissedSimplification() {}
345
346
/// Start accounting potential benefits due to SROA for the given alloca.
347
virtual void onInitializeSROAArg(AllocaInst *Arg) {}
348
349
/// Account SROA savings for the AllocaInst value.
350
virtual void onAggregateSROAUse(AllocaInst *V) {}
351
352
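  /// If \p V maps back to an SROA-candidate alloca, either credit an SROA use
  /// for it (when \p DoNotDisable is set) and return true, or disable SROA for
  /// that alloca. Returns false otherwise.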
bool handleSROA(Value *V, bool DoNotDisable) {
353
// Check for SROA candidates in comparisons.
354
if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
355
if (DoNotDisable) {
356
onAggregateSROAUse(SROAArg);
357
return true;
358
}
359
disableSROAForArg(SROAArg);
360
}
361
return false;
362
}
363
364
bool IsCallerRecursive = false;
365
bool IsRecursiveCall = false;
366
bool ExposesReturnsTwice = false;
367
bool HasDynamicAlloca = false;
368
bool ContainsNoDuplicateCall = false;
369
bool HasReturn = false;
370
bool HasIndirectBr = false;
371
bool HasUninlineableIntrinsic = false;
372
bool InitsVargArgs = false;
373
374
/// Number of bytes allocated statically by the callee.
375
uint64_t AllocatedSize = 0;
376
unsigned NumInstructions = 0;
377
unsigned NumVectorInstructions = 0;
378
379
/// While we walk the potentially-inlined instructions, we build up and
380
/// maintain a mapping of simplified values specific to this callsite. The
381
/// idea is to propagate any special information we have about arguments to
382
/// this call through the inlinable section of the function, and account for
383
/// likely simplifications post-inlining. The most important aspect we track
384
/// is CFG altering simplifications -- when we prove a basic block dead, that
385
/// can cause dramatic shifts in the cost of inlining a function.
386
DenseMap<Value *, Constant *> SimplifiedValues;
387
388
/// Keep track of the values which map back (through function arguments) to
389
/// allocas on the caller stack which could be simplified through SROA.
390
DenseMap<Value *, AllocaInst *> SROAArgValues;
391
392
/// Keep track of Allocas for which we believe we may get SROA optimization.
393
DenseSet<AllocaInst *> EnabledSROAAllocas;
394
395
/// Keep track of values which map to a pointer base and constant offset.
396
DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
397
398
/// Keep track of dead blocks due to the constant arguments.
399
SmallPtrSet<BasicBlock *, 16> DeadBlocks;
400
401
/// The mapping of the blocks to their known unique successors due to the
402
/// constant arguments.
403
DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
404
405
/// Model the elimination of repeated loads that is expected to happen
406
/// whenever we simplify away the stores that would otherwise cause them to be
407
/// loads.
408
bool EnableLoadElimination = true;
409
410
/// Whether we allow inlining of recursive calls.
411
bool AllowRecursiveCall = false;
412
413
SmallPtrSet<Value *, 16> LoadAddrSet;
414
415
AllocaInst *getSROAArgForValueOrNull(Value *V) const {
416
auto It = SROAArgValues.find(V);
417
if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418
return nullptr;
419
return It->second;
420
}
421
422
// Custom simplification helper routines.
423
bool isAllocaDerivedArg(Value *V);
424
void disableSROAForArg(AllocaInst *SROAArg);
425
void disableSROA(Value *V);
426
void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
427
void disableLoadElimination();
428
bool isGEPFree(GetElementPtrInst &GEP);
429
bool canFoldInboundsGEP(GetElementPtrInst &I);
430
bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
431
bool simplifyCallSite(Function *F, CallBase &Call);
432
bool simplifyInstruction(Instruction &I);
433
bool simplifyIntrinsicCallIsConstant(CallBase &CB);
434
bool simplifyIntrinsicCallObjectSize(CallBase &CB);
435
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
436
437
/// Return true if the given argument to the function being considered for
438
/// inlining has the given attribute set either at the call site or the
439
/// function declaration. Primarily used to inspect call site specific
440
/// attributes since these can be more precise than the ones on the callee
441
/// itself.
442
bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
443
444
/// Return true if the given value is known non null within the callee if
445
/// inlined through this particular callsite.
446
bool isKnownNonNullInCallee(Value *V);
447
448
/// Return true if size growth is allowed when inlining the callee at \p Call.
449
bool allowSizeGrowth(CallBase &Call);
450
451
// Custom analysis routines.
452
InlineResult analyzeBlock(BasicBlock *BB,
453
SmallPtrSetImpl<const Value *> &EphValues);
454
455
// Disable several entry points to the visitor so we don't accidentally use
456
// them by declaring but not defining them here.
457
void visit(Module *);
458
void visit(Module &);
459
void visit(Function *);
460
void visit(Function &);
461
void visit(BasicBlock *);
462
void visit(BasicBlock &);
463
464
// Provide base case for our instruction visit.
465
bool visitInstruction(Instruction &I);
466
467
// Our visit overrides.
468
bool visitAlloca(AllocaInst &I);
469
bool visitPHI(PHINode &I);
470
bool visitGetElementPtr(GetElementPtrInst &I);
471
bool visitBitCast(BitCastInst &I);
472
bool visitPtrToInt(PtrToIntInst &I);
473
bool visitIntToPtr(IntToPtrInst &I);
474
bool visitCastInst(CastInst &I);
475
bool visitCmpInst(CmpInst &I);
476
bool visitSub(BinaryOperator &I);
477
bool visitBinaryOperator(BinaryOperator &I);
478
bool visitFNeg(UnaryOperator &I);
479
bool visitLoad(LoadInst &I);
480
bool visitStore(StoreInst &I);
481
bool visitExtractValue(ExtractValueInst &I);
482
bool visitInsertValue(InsertValueInst &I);
483
bool visitCallBase(CallBase &Call);
484
bool visitReturnInst(ReturnInst &RI);
485
bool visitBranchInst(BranchInst &BI);
486
bool visitSelectInst(SelectInst &SI);
487
bool visitSwitchInst(SwitchInst &SI);
488
bool visitIndirectBrInst(IndirectBrInst &IBI);
489
bool visitResumeInst(ResumeInst &RI);
490
bool visitCleanupReturnInst(CleanupReturnInst &RI);
491
bool visitCatchReturnInst(CatchReturnInst &RI);
492
bool visitUnreachableInst(UnreachableInst &I);
493
494
public:
495
CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
496
function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
497
function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
498
ProfileSummaryInfo *PSI = nullptr,
499
OptimizationRemarkEmitter *ORE = nullptr)
500
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
501
PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
502
CandidateCall(Call) {}
503
504
InlineResult analyze();
505
506
std::optional<Constant *> getSimplifiedValue(Instruction *I) {
507
if (SimplifiedValues.contains(I))
508
return SimplifiedValues[I];
509
return std::nullopt;
510
}
511
512
// Keep a bunch of stats about the cost savings found so we can print them
513
// out when debugging.
514
unsigned NumConstantArgs = 0;
515
unsigned NumConstantOffsetPtrArgs = 0;
516
unsigned NumAllocaArgs = 0;
517
unsigned NumConstantPtrCmps = 0;
518
unsigned NumConstantPtrDiffs = 0;
519
unsigned NumInstructionsSimplified = 0;
520
521
void dump();
522
};
523
524
// When lowering a switch to a binary search over case clusters, the number of
// nodes in the search tree equals the number of comparisons emitted. For a
// given number of clusters, n, we can define a recursive function, f(n), to
// find the number of nodes in the tree:
//   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
//   f(n) = n,                       when n <= 3.
// This leads to a binary tree whose leaves are either f(2) or f(3) when n > 3.
// So the number of comparisons from the leaves is n, while the number from the
// non-leaf nodes is:
//   2^(log2(n) - 1) - 1
//     = 2^log2(n) * 2^-1 - 1
//     = n / 2 - 1.
// Combining the comparisons from leaf and non-leaf nodes gives a simple closed
// form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
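
// For example, for 8 case clusters the recursion gives
//   f(8) = 1 + f(4) + f(4), with f(4) = 1 + f(2) + f(2) = 5,
// so f(8) = 11, matching the closed form 3 * 8 / 2 - 1 = 11 comparisons.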
542
543
/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545
class InlineCostCallAnalyzer final : public CallAnalyzer {
546
const bool ComputeFullInlineCost;
547
int LoadEliminationCost = 0;
548
/// Bonus to be applied when percentage of vector instructions in callee is
549
/// high (see more details in updateThreshold).
550
int VectorBonus = 0;
551
/// Bonus to be applied when the callee has only one reachable basic block.
552
int SingleBBBonus = 0;
553
554
/// Tunable parameters that control the analysis.
555
const InlineParams &Params;
556
557
// This DenseMap stores the delta change in cost and threshold after
558
// accounting for the given instruction. The map is filled only with the
559
// flag PrintInstructionComments on.
560
DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
561
562
/// Upper bound for the inlining cost. Bonuses are being applied to account
563
/// for speculative "expected profit" of the inlining decision.
564
int Threshold = 0;
565
566
/// The amount of StaticBonus applied.
567
int StaticBonusApplied = 0;
568
569
/// Attempt to evaluate indirect calls to boost its inline cost.
570
const bool BoostIndirectCalls;
571
572
/// Ignore the threshold when finalizing analysis.
573
const bool IgnoreThreshold;
574
575
// True if the cost-benefit-analysis-based inliner is enabled.
576
const bool CostBenefitAnalysisEnabled;
577
578
/// Inlining cost measured in abstract units; it accounts for all the
579
/// instructions expected to be executed for a given function invocation.
580
/// Instructions that are statically proven to be dead based on call-site
581
/// arguments are not counted here.
582
int Cost = 0;
583
584
// The cumulative cost at the beginning of the basic block being analyzed. At
585
// the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586
// the size of that basic block.
587
int CostAtBBStart = 0;
588
589
// The static size of live but cold basic blocks. This is "static" in the
590
// sense that it's not weighted by profile counts at all.
591
int ColdSize = 0;
592
593
// Whether inlining is decided by cost-threshold analysis.
594
bool DecidedByCostThreshold = false;
595
596
// Whether inlining is decided by cost-benefit analysis.
597
bool DecidedByCostBenefit = false;
598
599
// The cost-benefit pair computed by cost-benefit analysis.
600
std::optional<CostBenefitPair> CostBenefit;
601
602
bool SingleBB = true;
603
604
unsigned SROACostSavings = 0;
605
unsigned SROACostSavingsLost = 0;
606
607
/// The mapping of caller Alloca values to their accumulated cost savings. If
608
/// we have to disable SROA for one of the allocas, this tells us how much
609
/// cost must be added.
610
DenseMap<AllocaInst *, int> SROAArgCosts;
611
612
/// Return true if \p Call is a cold callsite.
613
bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615
/// Update Threshold based on callsite properties such as callee
616
/// attributes and callee hotness for PGO builds. The Callee is explicitly
617
/// passed to support analyzing indirect calls whose target is inferred by
618
/// analysis.
619
void updateThreshold(CallBase &Call, Function &Callee);
620
/// Return a higher threshold if \p Call is a hot callsite.
621
std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622
BlockFrequencyInfo *CallerBFI);
623
624
/// Handle a capped 'int' increment for Cost.
625
void addCost(int64_t Inc) {
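    // The sum below is evaluated in int64_t and clamped to [INT_MIN, INT_MAX],
    // so large penalties saturate rather than overflow the 'int' Cost.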
626
Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627
Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628
}
629
630
void onDisableSROA(AllocaInst *Arg) override {
631
auto CostIt = SROAArgCosts.find(Arg);
632
if (CostIt == SROAArgCosts.end())
633
return;
634
addCost(CostIt->second);
635
SROACostSavings -= CostIt->second;
636
SROACostSavingsLost += CostIt->second;
637
SROAArgCosts.erase(CostIt);
638
}
639
640
void onDisableLoadElimination() override {
641
addCost(LoadEliminationCost);
642
LoadEliminationCost = 0;
643
}
644
645
bool onCallBaseVisitStart(CallBase &Call) override {
646
if (std::optional<int> AttrCallThresholdBonus =
647
getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648
Threshold += *AttrCallThresholdBonus;
649
650
if (std::optional<int> AttrCallCost =
651
getStringFnAttrAsInt(Call, "call-inline-cost")) {
652
addCost(*AttrCallCost);
653
// Prevent further processing of the call since we want to override its
654
// inline cost, not just add to it.
655
return false;
656
}
657
return true;
658
}
659
660
void onCallPenalty() override { addCost(CallPenalty); }
661
662
void onMemAccess() override { addCost(MemAccessCost); }
663
664
void onCallArgumentSetup(const CallBase &Call) override {
665
    // Pay the price of the argument setup. We account for an average of one
    // instruction per call argument here.
667
addCost(Call.arg_size() * InstrCost);
668
}
669
void onLoadRelativeIntrinsic() override {
670
// This is normally lowered to 4 LLVM instructions.
671
addCost(3 * InstrCost);
672
}
673
void onLoweredCall(Function *F, CallBase &Call,
674
bool IsIndirectCall) override {
675
    // We account for an average of one instruction per call argument here.
676
addCost(Call.arg_size() * InstrCost);
677
678
// If we have a constant that we are calling as a function, we can peer
679
// through it and see the function target. This happens not infrequently
680
// during devirtualization and so we want to give it a hefty bonus for
681
// inlining, but cap that bonus in the event that inlining wouldn't pan out.
682
// Pretend to inline the function, with a custom threshold.
683
if (IsIndirectCall && BoostIndirectCalls) {
684
auto IndirectCallParams = Params;
685
IndirectCallParams.DefaultThreshold =
686
InlineConstants::IndirectCallThreshold;
687
/// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688
/// to instantiate the derived class.
689
InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690
GetAssumptionCache, GetBFI, PSI, ORE, false);
691
if (CA.analyze().isSuccess()) {
692
// We were able to inline the indirect call! Subtract the cost from the
693
// threshold to get the bonus we want to apply, but don't go below zero.
694
Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695
}
696
} else
697
// Otherwise simply add the cost for merely making the call.
698
addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699
CallPenalty));
700
}
701
702
void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
703
bool DefaultDestUndefined) override {
704
// If suitable for a jump table, consider the cost for the table size and
705
// branch to destination.
706
// Maximum valid cost increased in this function.
707
if (JumpTableSize) {
708
// Suppose a default branch includes one compare and one conditional
709
// branch if it's reachable.
710
if (!DefaultDestUndefined)
711
addCost(2 * InstrCost);
712
// Suppose a jump table requires one load and one jump instruction.
713
int64_t JTCost =
714
static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
715
addCost(JTCost);
716
return;
717
}
718
719
if (NumCaseCluster <= 3) {
720
// Suppose a comparison includes one compare and one conditional branch.
721
// We can reduce a set of instructions if the default branch is
722
// undefined.
723
addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
724
return;
725
}
726
727
int64_t ExpectedNumberOfCompare =
728
getExpectedNumberOfCompare(NumCaseCluster);
729
int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
730
731
addCost(SwitchCost);
732
}
733
void onMissedSimplification() override { addCost(InstrCost); }
734
735
void onInitializeSROAArg(AllocaInst *Arg) override {
736
assert(Arg != nullptr &&
737
"Should not initialize SROA costs for null value.");
738
auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
739
SROACostSavings += SROAArgCost;
740
SROAArgCosts[Arg] = SROAArgCost;
741
}
742
743
void onAggregateSROAUse(AllocaInst *SROAArg) override {
744
auto CostIt = SROAArgCosts.find(SROAArg);
745
assert(CostIt != SROAArgCosts.end() &&
746
"expected this argument to have a cost");
747
CostIt->second += InstrCost;
748
SROACostSavings += InstrCost;
749
}
750
751
void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
752
753
void onBlockAnalyzed(const BasicBlock *BB) override {
754
if (CostBenefitAnalysisEnabled) {
755
// Keep track of the static size of live but cold basic blocks. For now,
756
// we define a cold basic block to be one that's never executed.
757
assert(GetBFI && "GetBFI must be available");
758
BlockFrequencyInfo *BFI = &(GetBFI(F));
759
assert(BFI && "BFI must be available");
760
auto ProfileCount = BFI->getBlockProfileCount(BB);
761
if (*ProfileCount == 0)
762
ColdSize += Cost - CostAtBBStart;
763
}
764
765
auto *TI = BB->getTerminator();
766
    // If we had any successors at this point, then post-inlining is likely to
767
// have them as well. Note that we assume any basic blocks which existed
768
// due to branches or switches which folded above will also fold after
769
// inlining.
770
if (SingleBB && TI->getNumSuccessors() > 1) {
771
// Take off the bonus we applied to the threshold.
772
Threshold -= SingleBBBonus;
773
SingleBB = false;
774
}
775
}
776
777
void onInstructionAnalysisStart(const Instruction *I) override {
778
// This function is called to store the initial cost of inlining before
779
// the given instruction was assessed.
780
if (!PrintInstructionComments)
781
return;
782
InstructionCostDetailMap[I].CostBefore = Cost;
783
InstructionCostDetailMap[I].ThresholdBefore = Threshold;
784
}
785
786
void onInstructionAnalysisFinish(const Instruction *I) override {
787
// This function is called to find new values of cost and threshold after
788
// the instruction has been assessed.
789
if (!PrintInstructionComments)
790
return;
791
InstructionCostDetailMap[I].CostAfter = Cost;
792
InstructionCostDetailMap[I].ThresholdAfter = Threshold;
793
}
794
795
bool isCostBenefitAnalysisEnabled() {
796
if (!PSI || !PSI->hasProfileSummary())
797
return false;
798
799
if (!GetBFI)
800
return false;
801
802
if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
803
// Honor the explicit request from the user.
804
if (!InlineEnableCostBenefitAnalysis)
805
return false;
806
} else {
807
// Otherwise, require instrumentation profile.
808
if (!PSI->hasInstrumentationProfile())
809
return false;
810
}
811
812
auto *Caller = CandidateCall.getParent()->getParent();
813
if (!Caller->getEntryCount())
814
return false;
815
816
BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
817
if (!CallerBFI)
818
return false;
819
820
// For now, limit to hot call site.
821
if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
822
return false;
823
824
// Make sure we have a nonzero entry count.
825
auto EntryCount = F.getEntryCount();
826
if (!EntryCount || !EntryCount->getCount())
827
return false;
828
829
BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
830
if (!CalleeBFI)
831
return false;
832
833
return true;
834
}
835
836
// A helper function to choose between command line override and default.
837
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
838
if (InlineSavingsMultiplier.getNumOccurrences())
839
return InlineSavingsMultiplier;
840
return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
841
}
842
843
// A helper function to choose between command line override and default.
844
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
845
if (InlineSavingsProfitableMultiplier.getNumOccurrences())
846
return InlineSavingsProfitableMultiplier;
847
return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
848
}
849
850
void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
851
if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
852
CandidateCall, "inline-cycle-savings-for-test")) {
853
CycleSavings = *AttrCycleSavings;
854
}
855
856
if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
857
CandidateCall, "inline-runtime-cost-for-test")) {
858
Size = *AttrRuntimeCost;
859
}
860
}
861
862
// Determine whether we should inline the given call site, taking into account
863
// both the size cost and the cycle savings. Return std::nullopt if we don't
864
// have sufficient profiling information to determine.
865
std::optional<bool> costBenefitAnalysis() {
866
if (!CostBenefitAnalysisEnabled)
867
return std::nullopt;
868
869
// buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
870
// for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
871
// falling back to the cost-based metric.
872
// TODO: Improve this hacky condition.
873
if (Threshold == 0)
874
return std::nullopt;
875
876
assert(GetBFI);
877
BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
878
assert(CalleeBFI);
879
880
// The cycle savings expressed as the sum of InstrCost
881
// multiplied by the estimated dynamic count of each instruction we can
882
// avoid. Savings come from the call site cost, such as argument setup and
883
// the call instruction, as well as the instructions that are folded.
884
//
885
// We use 128-bit APInt here to avoid potential overflow. This variable
886
// should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
887
// assumes that we can avoid or fold a billion instructions, each with a
888
// profile count of 10^^15 -- roughly the number of cycles for a 24-hour
889
// period on a 4GHz machine.
890
APInt CycleSavings(128, 0);
891
892
for (auto &BB : F) {
893
APInt CurrentSavings(128, 0);
894
for (auto &I : BB) {
895
if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
896
// Count a conditional branch as savings if it becomes unconditional.
897
if (BI->isConditional() &&
898
isa_and_nonnull<ConstantInt>(
899
SimplifiedValues.lookup(BI->getCondition()))) {
900
CurrentSavings += InstrCost;
901
}
902
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
903
if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
904
CurrentSavings += InstrCost;
905
} else if (Value *V = dyn_cast<Value>(&I)) {
906
// Count an instruction as savings if we can fold it.
907
if (SimplifiedValues.count(V)) {
908
CurrentSavings += InstrCost;
909
}
910
}
911
}
912
913
auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
914
CurrentSavings *= *ProfileCount;
915
CycleSavings += CurrentSavings;
916
}
917
918
// Compute the cycle savings per call.
919
auto EntryProfileCount = F.getEntryCount();
920
assert(EntryProfileCount && EntryProfileCount->getCount());
921
auto EntryCount = EntryProfileCount->getCount();
922
CycleSavings += EntryCount / 2;
923
CycleSavings = CycleSavings.udiv(EntryCount);
924
925
// Compute the total savings for the call site.
926
auto *CallerBB = CandidateCall.getParent();
927
BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
928
CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
929
CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
930
931
// Remove the cost of the cold basic blocks to model the runtime cost more
932
// accurately. Both machine block placement and function splitting could
933
// place cold blocks further from hot blocks.
934
int Size = Cost - ColdSize;
935
936
// Allow tiny callees to be inlined regardless of whether they meet the
937
// savings threshold.
938
Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
939
940
OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
941
CostBenefit.emplace(APInt(128, Size), CycleSavings);
942
943
// Let R be the ratio of CycleSavings to Size. We accept the inlining
944
// opportunity if R is really high and reject if R is really low. If R is
945
// somewhere in the middle, we fall back to the cost-based analysis.
946
//
947
// Specifically, let R = CycleSavings / Size, we accept the inlining
948
// opportunity if:
949
//
950
// PSI->getOrCompHotCountThreshold()
951
// R > -------------------------------------------------
952
// getInliningCostBenefitAnalysisSavingsMultiplier()
953
//
954
// and reject the inlining opportunity if:
955
//
956
// PSI->getOrCompHotCountThreshold()
957
// R <= ----------------------------------------------------
958
// getInliningCostBenefitAnalysisProfitableMultiplier()
959
//
960
// Otherwise, we fall back to the cost-based analysis.
961
//
962
// Implementation-wise, use multiplication (CycleSavings * Multiplier,
963
// HotCountThreshold * Size) rather than division to avoid precision loss.
964
APInt Threshold(128, PSI->getOrCompHotCountThreshold());
965
Threshold *= Size;
966
967
APInt UpperBoundCycleSavings = CycleSavings;
968
UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
969
if (UpperBoundCycleSavings.uge(Threshold))
970
return true;
971
972
APInt LowerBoundCycleSavings = CycleSavings;
973
LowerBoundCycleSavings *=
974
getInliningCostBenefitAnalysisProfitableMultiplier();
975
if (LowerBoundCycleSavings.ult(Threshold))
976
return false;
977
978
// Otherwise, fall back to the cost-based analysis.
979
return std::nullopt;
980
}
981
982
InlineResult finalizeAnalysis() override {
983
// Loops generally act a lot like calls in that they act like barriers to
984
// movement, require a certain amount of setup, etc. So when optimising for
985
// size, we penalise any call sites that perform loops. We do this after all
986
// other costs here, so will likely only be dealing with relatively small
987
// functions (and hence DT and LI will hopefully be cheap).
988
auto *Caller = CandidateCall.getFunction();
989
if (Caller->hasMinSize()) {
990
DominatorTree DT(F);
991
LoopInfo LI(DT);
992
int NumLoops = 0;
993
for (Loop *L : LI) {
994
// Ignore loops that will not be executed
995
if (DeadBlocks.count(L->getHeader()))
996
continue;
997
NumLoops++;
998
}
999
addCost(NumLoops * InlineConstants::LoopPenalty);
1000
}
1001
1002
// We applied the maximum possible vector bonus at the beginning. Now,
1003
// subtract the excess bonus, if any, from the Threshold before
1004
// comparing against Cost.
1005
if (NumVectorInstructions <= NumInstructions / 10)
1006
Threshold -= VectorBonus;
1007
else if (NumVectorInstructions <= NumInstructions / 2)
1008
Threshold -= VectorBonus / 2;
1009
1010
if (std::optional<int> AttrCost =
1011
getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1012
Cost = *AttrCost;
1013
1014
if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1015
CandidateCall,
1016
InlineConstants::FunctionInlineCostMultiplierAttributeName))
1017
Cost *= *AttrCostMult;
1018
1019
if (std::optional<int> AttrThreshold =
1020
getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1021
Threshold = *AttrThreshold;
1022
1023
if (auto Result = costBenefitAnalysis()) {
1024
DecidedByCostBenefit = true;
1025
if (*Result)
1026
return InlineResult::success();
1027
else
1028
return InlineResult::failure("Cost over threshold.");
1029
}
1030
1031
if (IgnoreThreshold)
1032
return InlineResult::success();
1033
1034
DecidedByCostThreshold = true;
1035
return Cost < std::max(1, Threshold)
1036
? InlineResult::success()
1037
: InlineResult::failure("Cost over threshold.");
1038
}
1039
1040
bool shouldStop() override {
1041
if (IgnoreThreshold || ComputeFullInlineCost)
1042
return false;
1043
// Bail out the moment we cross the threshold. This means we'll under-count
1044
// the cost, but only when undercounting doesn't matter.
1045
if (Cost < Threshold)
1046
return false;
1047
DecidedByCostThreshold = true;
1048
return true;
1049
}
1050
1051
void onLoadEliminationOpportunity() override {
1052
LoadEliminationCost += InstrCost;
1053
}
1054
1055
InlineResult onAnalysisStart() override {
1056
// Perform some tweaks to the cost and threshold based on the direct
1057
// callsite information.
1058
1059
// We want to more aggressively inline vector-dense kernels, so up the
1060
// threshold, and we'll lower it if the % of vector instructions gets too
1061
    // low. Note that these bonuses are somewhat arbitrary and evolved over
1062
// time by accident as much as because they are principled bonuses.
1063
//
1064
// FIXME: It would be nice to remove all such bonuses. At least it would be
1065
// nice to base the bonus values on something more scientific.
1066
assert(NumInstructions == 0);
1067
assert(NumVectorInstructions == 0);
1068
1069
// Update the threshold based on callsite properties
1070
updateThreshold(CandidateCall, F);
1071
1072
// While Threshold depends on commandline options that can take negative
1073
// values, we want to enforce the invariant that the computed threshold and
1074
// bonuses are non-negative.
1075
assert(Threshold >= 0);
1076
assert(SingleBBBonus >= 0);
1077
assert(VectorBonus >= 0);
1078
1079
// Speculatively apply all possible bonuses to Threshold. If cost exceeds
1080
// this Threshold any time, and cost cannot decrease, we can stop processing
1081
// the rest of the function body.
1082
Threshold += (SingleBBBonus + VectorBonus);
1083
1084
// Give out bonuses for the callsite, as the instructions setting them up
1085
// will be gone after inlining.
1086
addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1087
1088
// If this function uses the coldcc calling convention, prefer not to inline
1089
// it.
1090
if (F.getCallingConv() == CallingConv::Cold)
1091
Cost += InlineConstants::ColdccPenalty;
1092
1093
LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1094
1095
// Check if we're done. This can happen due to bonuses and penalties.
1096
if (Cost >= Threshold && !ComputeFullInlineCost)
1097
return InlineResult::failure("high cost");
1098
1099
return InlineResult::success();
1100
}
1101
1102
public:
1103
InlineCostCallAnalyzer(
1104
Function &Callee, CallBase &Call, const InlineParams &Params,
1105
const TargetTransformInfo &TTI,
1106
function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1107
function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1108
ProfileSummaryInfo *PSI = nullptr,
1109
OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1110
bool IgnoreThreshold = false)
1111
: CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1112
ComputeFullInlineCost(OptComputeFullInlineCost ||
1113
Params.ComputeFullInlineCost || ORE ||
1114
isCostBenefitAnalysisEnabled()),
1115
Params(Params), Threshold(Params.DefaultThreshold),
1116
BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1117
CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1118
Writer(this) {
1119
AllowRecursiveCall = *Params.AllowRecursiveCall;
1120
}
1121
1122
/// Annotation Writer for instruction details
1123
InlineCostAnnotationWriter Writer;
1124
1125
void dump();
1126
1127
// Prints the same analysis as dump(), but its definition is not dependent
1128
// on the build.
1129
void print(raw_ostream &OS);
1130
1131
std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1132
if (InstructionCostDetailMap.contains(I))
1133
return InstructionCostDetailMap[I];
1134
return std::nullopt;
1135
}
1136
1137
virtual ~InlineCostCallAnalyzer() = default;
1138
int getThreshold() const { return Threshold; }
1139
int getCost() const { return Cost; }
1140
int getStaticBonusApplied() const { return StaticBonusApplied; }
1141
std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1142
bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1143
bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1144
};
1145
1146
// Return true if CB is the sole call to local function Callee.
1147
static bool isSoleCallToLocalFunction(const CallBase &CB,
1148
const Function &Callee) {
1149
return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1150
&Callee == CB.getCalledFunction();
1151
}
1152
1153
class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1154
private:
1155
InlineCostFeatures Cost = {};
1156
1157
// FIXME: These constants are taken from the heuristic-based cost visitor.
1158
// These should be removed entirely in a later revision to avoid reliance on
1159
// heuristics in the ML inliner.
1160
static constexpr int JTCostMultiplier = 2;
1161
static constexpr int CaseClusterCostMultiplier = 2;
1162
static constexpr int SwitchDefaultDestCostMultiplier = 2;
1163
static constexpr int SwitchCostMultiplier = 2;
1164
1165
// FIXME: These are taken from the heuristic-based cost visitor: we should
1166
// eventually abstract these to the CallAnalyzer to avoid duplication.
1167
unsigned SROACostSavingOpportunities = 0;
1168
int VectorBonus = 0;
1169
int SingleBBBonus = 0;
1170
int Threshold = 5;
1171
1172
DenseMap<AllocaInst *, unsigned> SROACosts;
1173
1174
void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1175
Cost[static_cast<size_t>(Feature)] += Delta;
1176
}
1177
1178
void set(InlineCostFeatureIndex Feature, int64_t Value) {
1179
Cost[static_cast<size_t>(Feature)] = Value;
1180
}
1181
1182
void onDisableSROA(AllocaInst *Arg) override {
1183
auto CostIt = SROACosts.find(Arg);
1184
if (CostIt == SROACosts.end())
1185
return;
1186
1187
increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1188
SROACostSavingOpportunities -= CostIt->second;
1189
SROACosts.erase(CostIt);
1190
}
1191
1192
void onDisableLoadElimination() override {
1193
set(InlineCostFeatureIndex::load_elimination, 1);
1194
}
1195
1196
void onCallPenalty() override {
1197
increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1198
}
1199
1200
void onCallArgumentSetup(const CallBase &Call) override {
1201
increment(InlineCostFeatureIndex::call_argument_setup,
1202
Call.arg_size() * InstrCost);
1203
}
1204
1205
void onLoadRelativeIntrinsic() override {
1206
increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1207
}
1208
1209
void onLoweredCall(Function *F, CallBase &Call,
1210
bool IsIndirectCall) override {
1211
increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1212
Call.arg_size() * InstrCost);
1213
1214
if (IsIndirectCall) {
1215
InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1216
/*HintThreshold*/ {},
1217
/*ColdThreshold*/ {},
1218
/*OptSizeThreshold*/ {},
1219
/*OptMinSizeThreshold*/ {},
1220
/*HotCallSiteThreshold*/ {},
1221
/*LocallyHotCallSiteThreshold*/ {},
1222
/*ColdCallSiteThreshold*/ {},
1223
/*ComputeFullInlineCost*/ true,
1224
/*EnableDeferral*/ true};
1225
IndirectCallParams.DefaultThreshold =
1226
InlineConstants::IndirectCallThreshold;
1227
1228
InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1229
GetAssumptionCache, GetBFI, PSI, ORE, false,
1230
true);
1231
if (CA.analyze().isSuccess()) {
1232
increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1233
CA.getCost());
1234
increment(InlineCostFeatureIndex::nested_inlines, 1);
1235
}
1236
} else {
1237
onCallPenalty();
1238
}
1239
}
1240
1241
void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1242
bool DefaultDestUndefined) override {
1243
if (JumpTableSize) {
1244
if (!DefaultDestUndefined)
1245
increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1246
SwitchDefaultDestCostMultiplier * InstrCost);
1247
int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1248
JTCostMultiplier * InstrCost;
1249
increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1250
return;
1251
}
1252
1253
if (NumCaseCluster <= 3) {
1254
increment(InlineCostFeatureIndex::case_cluster_penalty,
1255
(NumCaseCluster - DefaultDestUndefined) *
1256
CaseClusterCostMultiplier * InstrCost);
1257
return;
1258
}
1259
1260
int64_t ExpectedNumberOfCompare =
1261
getExpectedNumberOfCompare(NumCaseCluster);
1262
1263
int64_t SwitchCost =
1264
ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1265
increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1266
}
1267
1268
void onMissedSimplification() override {
1269
increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1270
InstrCost);
1271
}
1272
1273
void onInitializeSROAArg(AllocaInst *Arg) override {
1274
auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1275
SROACosts[Arg] = SROAArgCost;
1276
SROACostSavingOpportunities += SROAArgCost;
1277
}
1278
1279
void onAggregateSROAUse(AllocaInst *Arg) override {
1280
SROACosts.find(Arg)->second += InstrCost;
1281
SROACostSavingOpportunities += InstrCost;
1282
}
1283
1284
void onBlockAnalyzed(const BasicBlock *BB) override {
1285
if (BB->getTerminator()->getNumSuccessors() > 1)
1286
set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1287
Threshold -= SingleBBBonus;
1288
}
1289
1290
InlineResult finalizeAnalysis() override {
1291
auto *Caller = CandidateCall.getFunction();
1292
if (Caller->hasMinSize()) {
1293
DominatorTree DT(F);
1294
LoopInfo LI(DT);
1295
for (Loop *L : LI) {
1296
// Ignore loops that will not be executed
1297
if (DeadBlocks.count(L->getHeader()))
1298
continue;
1299
increment(InlineCostFeatureIndex::num_loops,
1300
InlineConstants::LoopPenalty);
1301
}
1302
}
1303
set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1304
set(InlineCostFeatureIndex::simplified_instructions,
1305
NumInstructionsSimplified);
1306
set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1307
set(InlineCostFeatureIndex::constant_offset_ptr_args,
1308
NumConstantOffsetPtrArgs);
1309
set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1310
1311
if (NumVectorInstructions <= NumInstructions / 10)
1312
Threshold -= VectorBonus;
1313
else if (NumVectorInstructions <= NumInstructions / 2)
1314
Threshold -= VectorBonus / 2;
1315
1316
set(InlineCostFeatureIndex::threshold, Threshold);
1317
1318
return InlineResult::success();
1319
}
1320
1321
bool shouldStop() override { return false; }
1322
1323
void onLoadEliminationOpportunity() override {
1324
increment(InlineCostFeatureIndex::load_elimination, 1);
1325
}
1326
1327
InlineResult onAnalysisStart() override {
1328
increment(InlineCostFeatureIndex::callsite_cost,
1329
-1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1330
1331
set(InlineCostFeatureIndex::cold_cc_penalty,
1332
(F.getCallingConv() == CallingConv::Cold));
1333
1334
set(InlineCostFeatureIndex::last_call_to_static_bonus,
1335
isSoleCallToLocalFunction(CandidateCall, F));
1336
1337
// FIXME: we shouldn't repeat this logic in both the Features and Cost
1338
// analyzer - instead, we should abstract it to a common method in the
1339
// CallAnalyzer
1340
int SingleBBBonusPercent = 50;
1341
int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1342
Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1343
Threshold *= TTI.getInliningThresholdMultiplier();
1344
SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1345
VectorBonus = Threshold * VectorBonusPercent / 100;
1346
Threshold += (SingleBBBonus + VectorBonus);
1347
1348
return InlineResult::success();
1349
}
1350
1351
public:
1352
InlineCostFeaturesAnalyzer(
1353
const TargetTransformInfo &TTI,
1354
function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1355
function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1356
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1357
CallBase &Call)
1358
: CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1359
1360
const InlineCostFeatures &features() const { return Cost; }
1361
};
1362
1363
} // namespace
1364
1365
/// Test whether the given value is an Alloca-derived function argument.
1366
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1367
return SROAArgValues.count(V);
1368
}
1369
1370
void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1371
onDisableSROA(SROAArg);
1372
EnabledSROAAllocas.erase(SROAArg);
1373
disableLoadElimination();
1374
}
1375
1376
void InlineCostAnnotationWriter::emitInstructionAnnot(
1377
const Instruction *I, formatted_raw_ostream &OS) {
1378
  // The cost of inlining the given instruction is always printed.
1379
// The threshold delta is printed only when it is non-zero. It happens
1380
// when we decided to give a bonus at a particular instruction.
1381
std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1382
if (!Record)
1383
OS << "; No analysis for the instruction";
1384
else {
1385
OS << "; cost before = " << Record->CostBefore
1386
<< ", cost after = " << Record->CostAfter
1387
<< ", threshold before = " << Record->ThresholdBefore
1388
<< ", threshold after = " << Record->ThresholdAfter << ", ";
1389
OS << "cost delta = " << Record->getCostDelta();
1390
if (Record->hasThresholdChanged())
1391
OS << ", threshold delta = " << Record->getThresholdDelta();
1392
}
1393
auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1394
if (C) {
1395
OS << ", simplified to ";
1396
(*C)->print(OS, true);
1397
}
1398
OS << "\n";
1399
}
1400
1401
/// If 'V' maps to a SROA candidate, disable SROA for it.
1402
void CallAnalyzer::disableSROA(Value *V) {
1403
if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1404
disableSROAForArg(SROAArg);
1405
}
1406
}
1407
1408
void CallAnalyzer::disableLoadElimination() {
1409
if (EnableLoadElimination) {
1410
onDisableLoadElimination();
1411
EnableLoadElimination = false;
1412
}
1413
}
1414
1415
/// Accumulate a constant GEP offset into an APInt if possible.
1416
///
1417
/// Returns false if unable to compute the offset for any reason. Respects any
1418
/// simplified values known during the analysis of this callsite.
1419
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1420
unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1421
assert(IntPtrWidth == Offset.getBitWidth());
1422
1423
for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1424
GTI != GTE; ++GTI) {
1425
ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1426
if (!OpC)
1427
if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1428
OpC = dyn_cast<ConstantInt>(SimpleOp);
1429
if (!OpC)
1430
return false;
1431
if (OpC->isZero())
1432
continue;
1433
1434
// Handle a struct index, which adds its field offset to the pointer.
1435
if (StructType *STy = GTI.getStructTypeOrNull()) {
1436
unsigned ElementIdx = OpC->getZExtValue();
1437
const StructLayout *SL = DL.getStructLayout(STy);
1438
Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1439
continue;
1440
}
1441
1442
APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1443
Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1444
}
1445
return true;
1446
}
1447
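// Illustrative sketch, not part of the original analysis: the same offset
// accumulation expressed over plain integers. Assume a hypothetical
// struct { i32, i64 } with a typical layout (size 16, second field at byte
// offset 8), addressed by one array step over the whole struct plus the
// field offset of the second member.
static long long exampleGEPOffset(long long StructAllocSize,
                                  long long FieldOffset,
                                  long long ArrayIndex) {
  // Offset = ArrayIndex * sizeof(struct) + offsetof(field); for the layout
  // assumed above, 1 * 16 + 8 = 24 bytes.
  return ArrayIndex * StructAllocSize + FieldOffset;
}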
1448
/// Use TTI to check whether a GEP is free.
1449
///
1450
/// Respects any simplified values known during the analysis of this callsite.
1451
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1452
SmallVector<Value *, 4> Operands;
1453
Operands.push_back(GEP.getOperand(0));
1454
for (const Use &Op : GEP.indices())
1455
if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1456
Operands.push_back(SimpleOp);
1457
else
1458
Operands.push_back(Op);
1459
return TTI.getInstructionCost(&GEP, Operands,
1460
TargetTransformInfo::TCK_SizeAndLatency) ==
1461
TargetTransformInfo::TCC_Free;
1462
}
1463
1464
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1465
disableSROA(I.getOperand(0));
1466
1467
// Check whether inlining will turn a dynamic alloca into a static
1468
// alloca and handle that case.
1469
if (I.isArrayAllocation()) {
1470
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1471
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1472
// Sometimes a dynamic alloca could be converted into a static alloca
1473
// after this constant prop, and become a huge static alloca on an
1474
// unconditional CFG path. Avoid inlining if this is going to happen above
1475
// a threshold.
1476
// FIXME: If the threshold is removed or lowered too much, we could end up
1477
// being too pessimistic and prevent inlining non-problematic code. This
1478
// could result in unintended perf regressions. A better overall strategy
1479
// is needed to track stack usage during inlining.
1480
Type *Ty = I.getAllocatedType();
1481
AllocatedSize = SaturatingMultiplyAdd(
1482
AllocSize->getLimitedValue(),
1483
DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1484
if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1485
HasDynamicAlloca = true;
1486
return false;
1487
}
1488
}
1489
1490
// Accumulate the allocated size.
1491
if (I.isStaticAlloca()) {
1492
Type *Ty = I.getAllocatedType();
1493
AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1494
AllocatedSize);
1495
}
1496
1497
// FIXME: This is overly conservative. Dynamic allocas are inefficient for
1498
// a variety of reasons, and so we would like to not inline them into
1499
// functions which don't currently have a dynamic alloca. This simply
1500
// disables inlining altogether in the presence of a dynamic alloca.
1501
if (!I.isStaticAlloca())
1502
HasDynamicAlloca = true;
1503
1504
return false;
1505
}
1506
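// Illustrative sketch, not part of the original analysis: how the allocated
// size accumulates with saturation. The caller's limit is a hypothetical
// stand-in for the real inliner constant.
static unsigned long long exampleAccumulateAlloca(unsigned long long Current,
                                                  unsigned long long Count,
                                                  unsigned long long ElemSize) {
  // Saturating Current + Count * ElemSize: clamp instead of wrapping on
  // overflow, so a huge constant-propagated array size cannot wrap around to
  // a small number and sneak past the limit.
  const unsigned long long Max = ~0ULL;
  if (Count != 0 && ElemSize > Max / Count)
    return Max;
  unsigned long long Added = Count * ElemSize;
  return Current > Max - Added ? Max : Current + Added;
}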
1507
bool CallAnalyzer::visitPHI(PHINode &I) {
1508
// FIXME: We need to propagate SROA *disabling* through phi nodes, even
1509
// though we don't want to propagate its bonuses. The idea is to disable
1510
// SROA if it *might* be used in an inappropriate manner.
1511
1512
// Phi nodes are always zero-cost.
1513
// FIXME: Pointer sizes may differ between different address spaces, so do we
1514
// need to use the correct address space in the call to getPointerSizeInBits here?
1515
// Or could we skip the getPointerSizeInBits call completely? As far as I can
1516
// see the ZeroOffset is used as a dummy value, so we can probably use any
1517
// bit width for the ZeroOffset?
1518
APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1519
bool CheckSROA = I.getType()->isPointerTy();
1520
1521
// Track the constant or pointer with constant offset we've seen so far.
1522
Constant *FirstC = nullptr;
1523
std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1524
Value *FirstV = nullptr;
1525
1526
for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1527
BasicBlock *Pred = I.getIncomingBlock(i);
1528
// If the incoming block is dead, skip the incoming block.
1529
if (DeadBlocks.count(Pred))
1530
continue;
1531
// If the parent block of phi is not the known successor of the incoming
1532
// block, skip the incoming block.
1533
BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1534
if (KnownSuccessor && KnownSuccessor != I.getParent())
1535
continue;
1536
1537
Value *V = I.getIncomingValue(i);
1538
// If the incoming value is this phi itself, skip the incoming value.
1539
if (&I == V)
1540
continue;
1541
1542
Constant *C = dyn_cast<Constant>(V);
1543
if (!C)
1544
C = SimplifiedValues.lookup(V);
1545
1546
std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1547
if (!C && CheckSROA)
1548
BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1549
1550
if (!C && !BaseAndOffset.first)
1551
// The incoming value is neither a constant nor a pointer with constant
1552
// offset; exit early.
1553
return true;
1554
1555
if (FirstC) {
1556
if (FirstC == C)
1557
// If we've seen a constant incoming value before and it is the same
1558
// constant we see this time, continue checking the next incoming value.
1559
continue;
1560
// Otherwise early exit because we either see a different constant or saw
1561
// a constant before but we have a pointer with constant offset this time.
1562
return true;
1563
}
1564
1565
if (FirstV) {
1566
// The same logic as above, but check pointer with constant offset here.
1567
if (FirstBaseAndOffset == BaseAndOffset)
1568
continue;
1569
return true;
1570
}
1571
1572
if (C) {
1573
// This is the 1st time we've seen a constant, record it.
1574
FirstC = C;
1575
continue;
1576
}
1577
1578
// The remaining case is that this is the 1st time we've seen a pointer with
1579
// constant offset, record it.
1580
FirstV = V;
1581
FirstBaseAndOffset = BaseAndOffset;
1582
}
1583
1584
// Check if we can map phi to a constant.
1585
if (FirstC) {
1586
SimplifiedValues[&I] = FirstC;
1587
return true;
1588
}
1589
1590
// Check if we can map phi to a pointer with constant offset.
1591
if (FirstBaseAndOffset.first) {
1592
ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1593
1594
if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1595
SROAArgValues[&I] = SROAArg;
1596
}
1597
1598
return true;
1599
}
1600
1601
/// Check we can fold GEPs of constant-offset call site argument pointers.
1602
/// This requires target data and inbounds GEPs.
1603
///
1604
/// \return true if the specified GEP can be folded.
1605
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1606
// Check if we have a base + offset for the pointer.
1607
std::pair<Value *, APInt> BaseAndOffset =
1608
ConstantOffsetPtrs.lookup(I.getPointerOperand());
1609
if (!BaseAndOffset.first)
1610
return false;
1611
1612
// Check if the offset of this GEP is constant, and if so accumulate it
1613
// into Offset.
1614
if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1615
return false;
1616
1617
// Add the result as a new mapping to Base + Offset.
1618
ConstantOffsetPtrs[&I] = BaseAndOffset;
1619
1620
return true;
1621
}
1622
1623
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1624
auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1625
1626
// Lambda to check whether a GEP's indices are all constant.
1627
auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1628
for (const Use &Op : GEP.indices())
1629
if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1630
return false;
1631
return true;
1632
};
1633
1634
if (!DisableGEPConstOperand)
1635
if (simplifyInstruction(I))
1636
return true;
1637
1638
if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1639
if (SROAArg)
1640
SROAArgValues[&I] = SROAArg;
1641
1642
// Constant GEPs are modeled as free.
1643
return true;
1644
}
1645
1646
// Variable GEPs will require math and will disable SROA.
1647
if (SROAArg)
1648
disableSROAForArg(SROAArg);
1649
return isGEPFree(I);
1650
}
1651
1652
/// Simplify \p I if its operands are constants and update SimplifiedValues.
1653
bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1654
SmallVector<Constant *> COps;
1655
for (Value *Op : I.operands()) {
1656
Constant *COp = dyn_cast<Constant>(Op);
1657
if (!COp)
1658
COp = SimplifiedValues.lookup(Op);
1659
if (!COp)
1660
return false;
1661
COps.push_back(COp);
1662
}
1663
auto *C = ConstantFoldInstOperands(&I, COps, DL);
1664
if (!C)
1665
return false;
1666
SimplifiedValues[&I] = C;
1667
return true;
1668
}
1669
1670
/// Try to simplify a call to llvm.is.constant.
1671
///
1672
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1673
/// we expect calls of this specific intrinsic to be infrequent.
1674
///
1675
/// FIXME: Given that we know CB's parent (F) caller
1676
/// (CandidateCall->getParent()->getParent()), we might be able to determine
1677
/// whether inlining F into F's caller would change how the call to
1678
/// llvm.is.constant would evaluate.
1679
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1680
Value *Arg = CB.getArgOperand(0);
1681
auto *C = dyn_cast<Constant>(Arg);
1682
1683
if (!C)
1684
C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1685
1686
Type *RT = CB.getFunctionType()->getReturnType();
1687
SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1688
return true;
1689
}
1690
1691
bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1692
// As per the langref, "The fourth argument to llvm.objectsize determines if
1693
// the value should be evaluated at runtime."
1694
if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1695
return false;
1696
1697
Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1698
/*MustSucceed=*/true);
1699
Constant *C = dyn_cast_or_null<Constant>(V);
1700
if (C)
1701
SimplifiedValues[&CB] = C;
1702
return C;
1703
}
1704
1705
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1706
// Propagate constants through bitcasts.
1707
if (simplifyInstruction(I))
1708
return true;
1709
1710
// Track base/offsets through casts
1711
std::pair<Value *, APInt> BaseAndOffset =
1712
ConstantOffsetPtrs.lookup(I.getOperand(0));
1713
// Casts don't change the offset, just wrap it up.
1714
if (BaseAndOffset.first)
1715
ConstantOffsetPtrs[&I] = BaseAndOffset;
1716
1717
// Also look for SROA candidates here.
1718
if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1719
SROAArgValues[&I] = SROAArg;
1720
1721
// Bitcasts are always zero cost.
1722
return true;
1723
}
1724
1725
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1726
// Propagate constants through ptrtoint.
1727
if (simplifyInstruction(I))
1728
return true;
1729
1730
// Track base/offset pairs when converted to a plain integer provided the
1731
// integer is large enough to represent the pointer.
1732
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1733
unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1734
if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1735
std::pair<Value *, APInt> BaseAndOffset =
1736
ConstantOffsetPtrs.lookup(I.getOperand(0));
1737
if (BaseAndOffset.first)
1738
ConstantOffsetPtrs[&I] = BaseAndOffset;
1739
}
1740
1741
// This is really weird. Technically, ptrtoint will disable SROA. However,
1742
// unless that ptrtoint is *used* somewhere in the live basic blocks after
1743
// inlining, it will be nuked, and SROA should proceed. All of the uses which
1744
// would block SROA would also block SROA if applied directly to a pointer,
1745
// and so we can just add the integer in here. The only places where SROA is
1746
// preserved either cannot fire on an integer, or won't in-and-of themselves
1747
// disable SROA (ext) w/o some later use that we would see and disable.
1748
if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1749
SROAArgValues[&I] = SROAArg;
1750
1751
return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1752
TargetTransformInfo::TCC_Free;
1753
}
1754
1755
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1756
// Propagate constants through inttoptr.
1757
if (simplifyInstruction(I))
1758
return true;
1759
1760
// Track base/offset pairs when round-tripped through a pointer without
1761
// modifications provided the integer is not too large.
1762
Value *Op = I.getOperand(0);
1763
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1764
if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1765
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1766
if (BaseAndOffset.first)
1767
ConstantOffsetPtrs[&I] = BaseAndOffset;
1768
}
1769
1770
// "Propagate" SROA here in the same manner as we do for ptrtoint above.
1771
if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1772
SROAArgValues[&I] = SROAArg;
1773
1774
return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1775
TargetTransformInfo::TCC_Free;
1776
}
1777
1778
bool CallAnalyzer::visitCastInst(CastInst &I) {
1779
// Propagate constants through casts.
1780
if (simplifyInstruction(I))
1781
return true;
1782
1783
// Disable SROA in the face of arbitrary casts we don't explicitly list
1784
// elsewhere.
1785
disableSROA(I.getOperand(0));
1786
1787
// If this is a floating-point cast, and the target says this operation
1788
// is expensive, this may eventually become a library call. Treat the cost
1789
// as such.
1790
switch (I.getOpcode()) {
1791
case Instruction::FPTrunc:
1792
case Instruction::FPExt:
1793
case Instruction::UIToFP:
1794
case Instruction::SIToFP:
1795
case Instruction::FPToUI:
1796
case Instruction::FPToSI:
1797
if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1798
onCallPenalty();
1799
break;
1800
default:
1801
break;
1802
}
1803
1804
return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1805
TargetTransformInfo::TCC_Free;
1806
}
1807
1808
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1809
return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1810
}
1811
1812
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1813
// Does the *call site* have the NonNull attribute set on an argument? We
1814
// use the attribute on the call site to memoize any analysis done in the
1815
// caller. This will also trip if the callee function has a non-null
1816
// parameter attribute, but that's a less interesting case because hopefully
1817
// the callee would already have been simplified based on that.
1818
if (Argument *A = dyn_cast<Argument>(V))
1819
if (paramHasAttr(A, Attribute::NonNull))
1820
return true;
1821
1822
// Is this an alloca in the caller? This is distinct from the attribute case
1823
// above because attributes aren't updated within the inliner itself and we
1824
// always want to catch the alloca derived case.
1825
if (isAllocaDerivedArg(V))
1826
// We can actually predict the result of comparisons between an
1827
// alloca-derived value and null. Note that this fires regardless of
1828
// SROA firing.
1829
return true;
1830
1831
return false;
1832
}
1833
1834
bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1835
// If the normal destination of the invoke or the parent block of the call
1836
// site is unreachable-terminated, there is little point in inlining this
1837
// unless there is literally zero cost.
1838
// FIXME: Note that it is possible that an unreachable-terminated block has a
1839
// hot entry. For example, in the scenario below, inlining hot_call_X() may be
1840
// beneficial:
1841
// main() {
1842
// hot_call_1();
1843
// ...
1844
// hot_call_N()
1845
// exit(0);
1846
// }
1847
// For now, we are not handling this corner case here as it is rare in real
1848
// code. In the future, we should elaborate this based on BPI and BFI in more
1849
// general threshold adjusting heuristics in updateThreshold().
1850
if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1851
if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1852
return false;
1853
} else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1854
return false;
1855
1856
return true;
1857
}
1858
1859
bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1860
BlockFrequencyInfo *CallerBFI) {
1861
// If global profile summary is available, then callsite's coldness is
1862
// determined based on that.
1863
if (PSI && PSI->hasProfileSummary())
1864
return PSI->isColdCallSite(Call, CallerBFI);
1865
1866
// Otherwise we need BFI to be available.
1867
if (!CallerBFI)
1868
return false;
1869
1870
// Determine if the callsite is cold relative to caller's entry. We could
1871
// potentially cache the computation of scaled entry frequency, but the added
1872
// complexity is not worth it unless this scaling shows up high in the
1873
// profiles.
1874
const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1875
auto CallSiteBB = Call.getParent();
1876
auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1877
auto CallerEntryFreq =
1878
CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1879
return CallSiteFreq < CallerEntryFreq * ColdProb;
1880
}
1881
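// Illustrative sketch, not part of the original analysis: the "cold relative
// to the caller entry" test reduced to plain integers. RelFreqPercent is a
// hypothetical stand-in for ColdCallSiteRelFreq.
static bool exampleIsColdCallSite(unsigned long long CallSiteFreq,
                                  unsigned long long CallerEntryFreq,
                                  unsigned RelFreqPercent) {
  // Cold if the callsite runs less often than RelFreqPercent% of the entry
  // block: with entry frequency 1000 and RelFreqPercent = 2 the cutoff is 20,
  // so a callsite frequency of 15 counts as cold.
  return CallSiteFreq < CallerEntryFreq * RelFreqPercent / 100;
}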
1882
std::optional<int>
1883
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1884
BlockFrequencyInfo *CallerBFI) {
1885
1886
// If global profile summary is available, then callsite's hotness is
1887
// determined based on that.
1888
if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1889
return Params.HotCallSiteThreshold;
1890
1891
// Otherwise we need BFI to be available and to have a locally hot callsite
1892
// threshold.
1893
if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1894
return std::nullopt;
1895
1896
// Determine if the callsite is hot relative to caller's entry. We could
1897
// potentially cache the computation of scaled entry frequency, but the added
1898
// complexity is not worth it unless this scaling shows up high in the
1899
// profiles.
1900
const BasicBlock *CallSiteBB = Call.getParent();
1901
BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1902
BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1903
std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1904
if (Limit && CallSiteFreq >= *Limit)
1905
return Params.LocallyHotCallSiteThreshold;
1906
1907
// Otherwise treat it normally.
1908
return std::nullopt;
1909
}
1910
1911
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1912
// If no size growth is allowed for this inlining, set Threshold to 0.
1913
if (!allowSizeGrowth(Call)) {
1914
Threshold = 0;
1915
return;
1916
}
1917
1918
Function *Caller = Call.getCaller();
1919
1920
// return min(A, B) if B is valid.
1921
auto MinIfValid = [](int A, std::optional<int> B) {
1922
return B ? std::min(A, *B) : A;
1923
};
1924
1925
// return max(A, B) if B is valid.
1926
auto MaxIfValid = [](int A, std::optional<int> B) {
1927
return B ? std::max(A, *B) : A;
1928
};
1929
1930
// Various bonus percentages. These are multiplied by Threshold to get the
1931
// bonus values.
1932
// SingleBBBonus: This bonus is applied if the callee has a single reachable
1933
// basic block at the given callsite context. This is speculatively applied
1934
// and withdrawn if more than one basic block is seen.
1935
//
1936
// LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1937
// of the last call to a static function as inlining such functions is
1938
// guaranteed to reduce code size.
1939
//
1940
// These bonus percentages may be set to 0 based on properties of the caller
1941
// and the callsite.
1942
int SingleBBBonusPercent = 50;
1943
int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1944
int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1945
1946
// Lambda to set all the above bonus and bonus percentages to 0.
1947
auto DisallowAllBonuses = [&]() {
1948
SingleBBBonusPercent = 0;
1949
VectorBonusPercent = 0;
1950
LastCallToStaticBonus = 0;
1951
};
1952
1953
// Use the OptMinSizeThreshold or OptSizeThreshold knobs if they are available
1954
// and reduce the threshold if the caller has the necessary attribute.
1955
if (Caller->hasMinSize()) {
1956
Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1957
// For minsize, we want to disable the single BB bonus and the vector
1958
// bonuses, but not the last-call-to-static bonus. Inlining the last call to
1959
// a static function will, at the minimum, eliminate the parameter setup and
1960
// call/return instructions.
1961
SingleBBBonusPercent = 0;
1962
VectorBonusPercent = 0;
1963
} else if (Caller->hasOptSize())
1964
Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1965
1966
// Adjust the threshold based on inlinehint attribute and profile based
1967
// hotness information if the caller does not have MinSize attribute.
1968
if (!Caller->hasMinSize()) {
1969
if (Callee.hasFnAttribute(Attribute::InlineHint))
1970
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1971
1972
// FIXME: After switching to the new passmanager, simplify the logic below
1973
// by checking only the callsite hotness/coldness as we will reliably
1974
// have local profile information.
1975
//
1976
// Callsite hotness and coldness can be determined if sample profile is
1977
// used (which adds hotness metadata to calls) or if caller's
1978
// BlockFrequencyInfo is available.
1979
BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1980
auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1981
if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1982
LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1983
// FIXME: This should update the threshold only if it exceeds the
1984
// current threshold, but AutoFDO + ThinLTO currently relies on this
1985
// behavior to prevent inlining of hot callsites during ThinLTO
1986
// compile phase.
1987
Threshold = *HotCallSiteThreshold;
1988
} else if (isColdCallSite(Call, CallerBFI)) {
1989
LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1990
// Do not apply bonuses for a cold callsite including the
1991
// LastCallToStatic bonus. While this bonus might result in code size
1992
// reduction, it can cause the size of a non-cold caller to increase
1993
// preventing it from being inlined.
1994
DisallowAllBonuses();
1995
Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1996
} else if (PSI) {
1997
// Use callee's global profile information only if we have no way of
1998
// determining this via callsite information.
1999
if (PSI->isFunctionEntryHot(&Callee)) {
2000
LLVM_DEBUG(dbgs() << "Hot callee.\n");
2001
// If callsite hotness can not be determined, we may still know
2002
// that the callee is hot and treat it as a weaker hint for threshold
2003
// increase.
2004
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2005
} else if (PSI->isFunctionEntryCold(&Callee)) {
2006
LLVM_DEBUG(dbgs() << "Cold callee.\n");
2007
// Do not apply bonuses for a cold callee including the
2008
// LastCallToStatic bonus. While this bonus might result in code size
2009
// reduction, it can cause the size of a non-cold caller to increase
2010
// preventing it from being inlined.
2011
DisallowAllBonuses();
2012
Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2013
}
2014
}
2015
}
2016
2017
Threshold += TTI.adjustInliningThreshold(&Call);
2018
2019
// Finally, take the target-specific inlining threshold multiplier into
2020
// account.
2021
Threshold *= TTI.getInliningThresholdMultiplier();
2022
2023
SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2024
VectorBonus = Threshold * VectorBonusPercent / 100;
2025
2026
// If there is only one call of the function, and it has internal linkage,
2027
// the cost of inlining it drops dramatically. It may seem odd to update
2028
// Cost in updateThreshold, but the bonus depends on the logic in this method.
2029
if (isSoleCallToLocalFunction(Call, F)) {
2030
Cost -= LastCallToStaticBonus;
2031
StaticBonusApplied = LastCallToStaticBonus;
2032
}
2033
}
2034
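// Illustrative sketch, not part of the original analysis, of how the bonuses
// computed above scale with the threshold. All numbers are hypothetical; the
// real values come from TTI and the inline parameters.
static int exampleThresholdWithBonuses(int Threshold, int SingleBBBonusPercent,
                                       int VectorBonusPercent) {
  // E.g. Threshold = 225, SingleBBBonusPercent = 50, VectorBonusPercent = 150
  // yields bonuses of 112 and 337 on top of the base threshold. Both are
  // granted speculatively and withdrawn later if the callee turns out to have
  // multiple blocks or no significant vector code.
  int SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  int VectorBonus = Threshold * VectorBonusPercent / 100;
  return Threshold + SingleBBBonus + VectorBonus;
}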
2035
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2036
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2037
// First try to handle simplified comparisons.
2038
if (simplifyInstruction(I))
2039
return true;
2040
2041
if (I.getOpcode() == Instruction::FCmp)
2042
return false;
2043
2044
// Otherwise look for a comparison between constant offset pointers with
2045
// a common base.
2046
Value *LHSBase, *RHSBase;
2047
APInt LHSOffset, RHSOffset;
2048
std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2049
if (LHSBase) {
2050
std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2051
if (RHSBase && LHSBase == RHSBase) {
2052
// We have common bases, fold the icmp to a constant based on the
2053
// offsets.
2054
SimplifiedValues[&I] = ConstantInt::getBool(
2055
I.getType(),
2056
ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2057
++NumConstantPtrCmps;
2058
return true;
2059
}
2060
}
2061
2062
auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2063
for (auto *User : I.users())
2064
if (auto *Instr = dyn_cast<Instruction>(User))
2065
if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2066
return false;
2067
return true;
2068
};
2069
2070
// If the comparison is an equality comparison with null, we can simplify it
2071
// if we know the value (argument) can't be null
2072
if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2073
if (isKnownNonNullInCallee(I.getOperand(0))) {
2074
bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2075
SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2076
: ConstantInt::getFalse(I.getType());
2077
return true;
2078
}
2079
// Implicit null checks act as unconditional branches and their comparisons
2080
// should be treated as simplified and free of cost.
2081
if (isImplicitNullCheckCmp(I))
2082
return true;
2083
}
2084
return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2085
}
2086
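// Illustrative sketch, not part of the original analysis: once two pointers
// are known to share a base, an integer compare reduces to a compare of their
// constant byte offsets. Offsets here are plain integers rather than APInts,
// and only two predicates are shown.
static bool exampleFoldSameBaseICmp(long long LHSOffset, long long RHSOffset,
                                    bool IsEquality) {
  // E.g. with 4-byte elements, &A[4] == &A[7] folds to 16 == 28 -> false,
  // and &A[4] < &A[7] folds to 16 < 28 -> true.
  return IsEquality ? LHSOffset == RHSOffset : LHSOffset < RHSOffset;
}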
2087
bool CallAnalyzer::visitSub(BinaryOperator &I) {
2088
// Try to handle a special case: we can fold computing the difference of two
2089
// constant-related pointers.
2090
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2091
Value *LHSBase, *RHSBase;
2092
APInt LHSOffset, RHSOffset;
2093
std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2094
if (LHSBase) {
2095
std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2096
if (RHSBase && LHSBase == RHSBase) {
2097
// We have common bases, fold the subtract to a constant based on the
2098
// offsets.
2099
Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2100
Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2101
if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2102
SimplifiedValues[&I] = C;
2103
++NumConstantPtrDiffs;
2104
return true;
2105
}
2106
}
2107
}
2108
2109
// Otherwise, fall back to the generic logic for simplifying and handling
2110
// instructions.
2111
return Base::visitSub(I);
2112
}
2113
2114
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2115
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2116
Constant *CLHS = dyn_cast<Constant>(LHS);
2117
if (!CLHS)
2118
CLHS = SimplifiedValues.lookup(LHS);
2119
Constant *CRHS = dyn_cast<Constant>(RHS);
2120
if (!CRHS)
2121
CRHS = SimplifiedValues.lookup(RHS);
2122
2123
Value *SimpleV = nullptr;
2124
if (auto FI = dyn_cast<FPMathOperator>(&I))
2125
SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2126
FI->getFastMathFlags(), DL);
2127
else
2128
SimpleV =
2129
simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2130
2131
if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2132
SimplifiedValues[&I] = C;
2133
2134
if (SimpleV)
2135
return true;
2136
2137
// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2138
disableSROA(LHS);
2139
disableSROA(RHS);
2140
2141
// If the instruction is floating point, and the target says this operation
2142
// is expensive, this may eventually become a library call. Treat the cost
2143
// as such, unless it's fneg, which can be implemented with an xor.
2144
using namespace llvm::PatternMatch;
2145
if (I.getType()->isFloatingPointTy() &&
2146
TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2147
!match(&I, m_FNeg(m_Value())))
2148
onCallPenalty();
2149
2150
return false;
2151
}
2152
2153
bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2154
Value *Op = I.getOperand(0);
2155
Constant *COp = dyn_cast<Constant>(Op);
2156
if (!COp)
2157
COp = SimplifiedValues.lookup(Op);
2158
2159
Value *SimpleV = simplifyFNegInst(
2160
COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2161
2162
if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2163
SimplifiedValues[&I] = C;
2164
2165
if (SimpleV)
2166
return true;
2167
2168
// Disable any SROA on arguments to arbitrary, unsimplified fneg.
2169
disableSROA(Op);
2170
2171
return false;
2172
}
2173
2174
bool CallAnalyzer::visitLoad(LoadInst &I) {
2175
if (handleSROA(I.getPointerOperand(), I.isSimple()))
2176
return true;
2177
2178
// If the data is already loaded from this address and hasn't been clobbered
2179
// by any stores or calls, this load is likely to be redundant and can be
2180
// eliminated.
2181
if (EnableLoadElimination &&
2182
!LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2183
onLoadEliminationOpportunity();
2184
return true;
2185
}
2186
2187
onMemAccess();
2188
return false;
2189
}
2190
2191
bool CallAnalyzer::visitStore(StoreInst &I) {
2192
if (handleSROA(I.getPointerOperand(), I.isSimple()))
2193
return true;
2194
2195
// The store can potentially clobber loads and prevent repeated loads from
2196
// being eliminated.
2197
// FIXME:
2198
// 1. We can probably keep an initial set of eliminatable loads subtracted
2199
// from the cost even when we finally see a store. We just need to disable
2200
// *further* accumulation of elimination savings.
2201
// 2. We should probably at some point thread MemorySSA for the callee into
2202
// this and then use that to actually compute *really* precise savings.
2203
disableLoadElimination();
2204
2205
onMemAccess();
2206
return false;
2207
}
2208
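// Illustrative sketch, not part of the original analysis: the load
// elimination bookkeeping reduced to its core. A repeated simple load from an
// address already seen is treated as free until a store (or other clobber)
// disables the whole scheme. The type below is a hypothetical stand-in for
// the analyzer's LoadAddrSet/EnableLoadElimination state.
struct ExampleLoadTracker {
  bool Enabled = true;
  const void *Seen[16] = {}; // tiny fixed-size stand-in for LoadAddrSet
  unsigned NumSeen = 0;

  bool loadIsFree(const void *Addr) {
    if (!Enabled)
      return false;
    for (unsigned I = 0; I != NumSeen; ++I)
      if (Seen[I] == Addr)
        return true;          // second load of the same address: free
    if (NumSeen < 16)
      Seen[NumSeen++] = Addr; // first load: remember the address, pay for it
    return false;
  }

  void visitStore() { Enabled = false; } // conservatively clobber everything
};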
2209
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2210
// Constant folding for extract value is trivial.
2211
if (simplifyInstruction(I))
2212
return true;
2213
2214
// SROA can't look through these, but they may be free.
2215
return Base::visitExtractValue(I);
2216
}
2217
2218
bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2219
// Constant folding for insert value is trivial.
2220
if (simplifyInstruction(I))
2221
return true;
2222
2223
// SROA can't look through these, but they may be free.
2224
return Base::visitInsertValue(I);
2225
}
2226
2227
/// Try to simplify a call site.
2228
///
2229
/// Takes a concrete function and callsite and tries to actually simplify it by
2230
/// analyzing the arguments and call itself with instsimplify. Returns true if
2231
/// it has simplified the callsite to some other entity (a constant), making it
2232
/// free.
2233
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2234
// FIXME: Using the instsimplify logic directly for this is inefficient
2235
// because we have to continually rebuild the argument list even when no
2236
// simplifications can be performed. Until that is fixed with remapping
2237
// inside of instsimplify, directly constant fold calls here.
2238
if (!canConstantFoldCallTo(&Call, F))
2239
return false;
2240
2241
// Try to re-map the arguments to constants.
2242
SmallVector<Constant *, 4> ConstantArgs;
2243
ConstantArgs.reserve(Call.arg_size());
2244
for (Value *I : Call.args()) {
2245
Constant *C = dyn_cast<Constant>(I);
2246
if (!C)
2247
C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2248
if (!C)
2249
return false; // This argument doesn't map to a constant.
2250
2251
ConstantArgs.push_back(C);
2252
}
2253
if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2254
SimplifiedValues[&Call] = C;
2255
return true;
2256
}
2257
2258
return false;
2259
}
2260
2261
bool CallAnalyzer::visitCallBase(CallBase &Call) {
2262
if (!onCallBaseVisitStart(Call))
2263
return true;
2264
2265
if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2266
!F.hasFnAttribute(Attribute::ReturnsTwice)) {
2267
// This aborts the entire analysis.
2268
ExposesReturnsTwice = true;
2269
return false;
2270
}
2271
if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2272
ContainsNoDuplicateCall = true;
2273
2274
Function *F = Call.getCalledFunction();
2275
bool IsIndirectCall = !F;
2276
if (IsIndirectCall) {
2277
// Check if this happens to be an indirect function call to a known function
2278
// in this inline context. If not, we've done all we can.
2279
Value *Callee = Call.getCalledOperand();
2280
F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2281
if (!F || F->getFunctionType() != Call.getFunctionType()) {
2282
onCallArgumentSetup(Call);
2283
2284
if (!Call.onlyReadsMemory())
2285
disableLoadElimination();
2286
return Base::visitCallBase(Call);
2287
}
2288
}
2289
2290
assert(F && "Expected a call to a known function");
2291
2292
// When we have a concrete function, first try to simplify it directly.
2293
if (simplifyCallSite(F, Call))
2294
return true;
2295
2296
// Next check if it is an intrinsic we know about.
2297
// FIXME: Lift this into part of the InstVisitor.
2298
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2299
switch (II->getIntrinsicID()) {
2300
default:
2301
if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2302
disableLoadElimination();
2303
return Base::visitCallBase(Call);
2304
2305
case Intrinsic::load_relative:
2306
onLoadRelativeIntrinsic();
2307
return false;
2308
2309
case Intrinsic::memset:
2310
case Intrinsic::memcpy:
2311
case Intrinsic::memmove:
2312
disableLoadElimination();
2313
// SROA can usually chew through these intrinsics, but they aren't free.
2314
return false;
2315
case Intrinsic::icall_branch_funnel:
2316
case Intrinsic::localescape:
2317
HasUninlineableIntrinsic = true;
2318
return false;
2319
case Intrinsic::vastart:
2320
InitsVargArgs = true;
2321
return false;
2322
case Intrinsic::launder_invariant_group:
2323
case Intrinsic::strip_invariant_group:
2324
if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2325
SROAArgValues[II] = SROAArg;
2326
return true;
2327
case Intrinsic::is_constant:
2328
return simplifyIntrinsicCallIsConstant(Call);
2329
case Intrinsic::objectsize:
2330
return simplifyIntrinsicCallObjectSize(Call);
2331
}
2332
}
2333
2334
if (F == Call.getFunction()) {
2335
// This flag will fully abort the analysis, so don't bother with anything
2336
// else.
2337
IsRecursiveCall = true;
2338
if (!AllowRecursiveCall)
2339
return false;
2340
}
2341
2342
if (TTI.isLoweredToCall(F)) {
2343
onLoweredCall(F, Call, IsIndirectCall);
2344
}
2345
2346
if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2347
disableLoadElimination();
2348
return Base::visitCallBase(Call);
2349
}
2350
2351
bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2352
// At least one return instruction will be free after inlining.
2353
bool Free = !HasReturn;
2354
HasReturn = true;
2355
return Free;
2356
}
2357
2358
bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2359
// We model unconditional branches as essentially free -- they really
2360
// shouldn't exist at all, but handling them makes the behavior of the
2361
// inliner more regular and predictable. Interestingly, conditional branches
2362
// which will fold away are also free.
2363
return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2364
BI.getMetadata(LLVMContext::MD_make_implicit) ||
2365
isa_and_nonnull<ConstantInt>(
2366
SimplifiedValues.lookup(BI.getCondition()));
2367
}
2368
2369
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2370
bool CheckSROA = SI.getType()->isPointerTy();
2371
Value *TrueVal = SI.getTrueValue();
2372
Value *FalseVal = SI.getFalseValue();
2373
2374
Constant *TrueC = dyn_cast<Constant>(TrueVal);
2375
if (!TrueC)
2376
TrueC = SimplifiedValues.lookup(TrueVal);
2377
Constant *FalseC = dyn_cast<Constant>(FalseVal);
2378
if (!FalseC)
2379
FalseC = SimplifiedValues.lookup(FalseVal);
2380
Constant *CondC =
2381
dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2382
2383
if (!CondC) {
2384
// Select C, X, X => X
2385
if (TrueC == FalseC && TrueC) {
2386
SimplifiedValues[&SI] = TrueC;
2387
return true;
2388
}
2389
2390
if (!CheckSROA)
2391
return Base::visitSelectInst(SI);
2392
2393
std::pair<Value *, APInt> TrueBaseAndOffset =
2394
ConstantOffsetPtrs.lookup(TrueVal);
2395
std::pair<Value *, APInt> FalseBaseAndOffset =
2396
ConstantOffsetPtrs.lookup(FalseVal);
2397
if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2398
ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2399
2400
if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2401
SROAArgValues[&SI] = SROAArg;
2402
return true;
2403
}
2404
2405
return Base::visitSelectInst(SI);
2406
}
2407
2408
// Select condition is a constant.
2409
Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2410
: (CondC->isNullValue()) ? FalseVal
2411
: nullptr;
2412
if (!SelectedV) {
2413
// Condition is a vector constant that is not all 1s or all 0s. If all
2414
// operands are constants, ConstantFoldSelectInstruction() can handle the
2415
// cases such as select vectors.
2416
if (TrueC && FalseC) {
2417
if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2418
SimplifiedValues[&SI] = C;
2419
return true;
2420
}
2421
}
2422
return Base::visitSelectInst(SI);
2423
}
2424
2425
// Condition is either all 1s or all 0s. SI can be simplified.
2426
if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2427
SimplifiedValues[&SI] = SelectedC;
2428
return true;
2429
}
2430
2431
if (!CheckSROA)
2432
return true;
2433
2434
std::pair<Value *, APInt> BaseAndOffset =
2435
ConstantOffsetPtrs.lookup(SelectedV);
2436
if (BaseAndOffset.first) {
2437
ConstantOffsetPtrs[&SI] = BaseAndOffset;
2438
2439
if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2440
SROAArgValues[&SI] = SROAArg;
2441
}
2442
2443
return true;
2444
}
2445
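// Illustrative sketch, not part of the original analysis: the scalar half of
// the select simplification above, with the condition reduced to a tri-state.
// All names here are hypothetical.
enum class ExampleCond { Unknown, AllOnes, AllZeros };

static const int *exampleFoldSelect(ExampleCond Cond, const int *TrueVal,
                                    const int *FalseVal) {
  if (Cond == ExampleCond::AllOnes)
    return TrueVal;
  if (Cond == ExampleCond::AllZeros)
    return FalseVal;
  // Unknown condition: fold only if both arms are the same value
  // (select C, X, X => X); nullptr signals "not simplified".
  return TrueVal == FalseVal ? TrueVal : nullptr;
}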
2446
bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2447
// We model unconditional switches as free, see the comments on handling
2448
// branches.
2449
if (isa<ConstantInt>(SI.getCondition()))
2450
return true;
2451
if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2452
if (isa<ConstantInt>(V))
2453
return true;
2454
2455
// Assume the most general case where the switch is lowered into
2456
// either a jump table, bit test, or a balanced binary tree consisting of
2457
// case clusters without merging adjacent clusters with the same
2458
// destination. We do not consider the switches that are lowered with a mix
2459
// of jump table/bit test/binary search tree. The cost of the switch is
2460
// proportional to the size of the tree or the size of jump table range.
2461
//
2462
// NB: We convert large switches which are just used to initialize large phi
2463
// nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2464
// inlining those. It will prevent inlining in cases where the optimization
2465
// does not (yet) fire.
2466
2467
unsigned JumpTableSize = 0;
2468
BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2469
unsigned NumCaseCluster =
2470
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2471
2472
onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
2473
return false;
2474
}
2475
2476
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2477
// We never want to inline functions that contain an indirectbr. This is
2478
// incorrect because all the blockaddress's (in static global initializers
2479
// for example) would be referring to the original function, and this
2480
// indirect jump would jump from the inlined copy of the function into the
2481
// original function which is extremely undefined behavior.
2482
// FIXME: This logic isn't really right; we can safely inline functions with
2483
// indirectbr's as long as no other function or global references the
2484
// blockaddress of a block within the current function.
2485
HasIndirectBr = true;
2486
return false;
2487
}
2488
2489
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2490
// FIXME: It's not clear that a single instruction is an accurate model for
2491
// the inline cost of a resume instruction.
2492
return false;
2493
}
2494
2495
bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2496
// FIXME: It's not clear that a single instruction is an accurate model for
2497
// the inline cost of a cleanupret instruction.
2498
return false;
2499
}
2500
2501
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2502
// FIXME: It's not clear that a single instruction is an accurate model for
2503
// the inline cost of a catchret instruction.
2504
return false;
2505
}
2506
2507
bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2508
// FIXME: It might be reasonable to discount the cost of instructions leading
2509
// to unreachable as they have the lowest possible impact on both runtime and
2510
// code size.
2511
return true; // No actual code is needed for unreachable.
2512
}
2513
2514
bool CallAnalyzer::visitInstruction(Instruction &I) {
2515
// Some instructions are free. All of the free intrinsics can also be
2516
// handled by SROA, etc.
2517
if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2518
TargetTransformInfo::TCC_Free)
2519
return true;
2520
2521
// We found something we don't understand or can't handle. Mark any SROA-able
2522
// values in the operand list as no longer viable.
2523
for (const Use &Op : I.operands())
2524
disableSROA(Op);
2525
2526
return false;
2527
}
2528
2529
/// Analyze a basic block for its contribution to the inline cost.
2530
///
2531
/// This method walks the analyzer over every instruction in the given basic
2532
/// block and accounts for their cost during inlining at this callsite. It
2533
/// aborts early if the threshold has been exceeded or an impossible to inline
2534
/// construct has been detected. It returns false if inlining is no longer
2535
/// viable, and true if inlining remains viable.
2536
InlineResult
2537
CallAnalyzer::analyzeBlock(BasicBlock *BB,
2538
SmallPtrSetImpl<const Value *> &EphValues) {
2539
for (Instruction &I : *BB) {
2540
// FIXME: Currently, the number of instructions in a function, regardless of
2541
// our ability to simplify them during inlining to constants or dead code,
2542
// is actually used by the vector bonus heuristic. As long as that's true,
2543
// we have to special case debug intrinsics here to prevent differences in
2544
// inlining due to debug symbols. Eventually, the number of unsimplified
2545
// instructions shouldn't factor into the cost computation, but until then,
2546
// hack around it here.
2547
// Similarly, skip pseudo-probes.
2548
if (I.isDebugOrPseudoInst())
2549
continue;
2550
2551
// Skip ephemeral values.
2552
if (EphValues.count(&I))
2553
continue;
2554
2555
++NumInstructions;
2556
if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2557
++NumVectorInstructions;
2558
2559
// If the instruction simplified to a constant, there is no cost to this
2560
// instruction. Visit the instructions using our InstVisitor to account for
2561
// all of the per-instruction logic. The visit tree returns true if we
2562
// consumed the instruction in any way, and false if the instruction's base
2563
// cost should count against inlining.
2564
onInstructionAnalysisStart(&I);
2565
2566
if (Base::visit(&I))
2567
++NumInstructionsSimplified;
2568
else
2569
onMissedSimplification();
2570
2571
onInstructionAnalysisFinish(&I);
2572
using namespace ore;
2573
// If the visit this instruction detected an uninlinable pattern, abort.
2574
InlineResult IR = InlineResult::success();
2575
if (IsRecursiveCall && !AllowRecursiveCall)
2576
IR = InlineResult::failure("recursive");
2577
else if (ExposesReturnsTwice)
2578
IR = InlineResult::failure("exposes returns twice");
2579
else if (HasDynamicAlloca)
2580
IR = InlineResult::failure("dynamic alloca");
2581
else if (HasIndirectBr)
2582
IR = InlineResult::failure("indirect branch");
2583
else if (HasUninlineableIntrinsic)
2584
IR = InlineResult::failure("uninlinable intrinsic");
2585
else if (InitsVargArgs)
2586
IR = InlineResult::failure("varargs");
2587
if (!IR.isSuccess()) {
2588
if (ORE)
2589
ORE->emit([&]() {
2590
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2591
&CandidateCall)
2592
<< NV("Callee", &F) << " has uninlinable pattern ("
2593
<< NV("InlineResult", IR.getFailureReason())
2594
<< ") and cost is not fully computed";
2595
});
2596
return IR;
2597
}
2598
2599
// If the caller is a recursive function then we don't want to inline
2600
// functions which allocate a lot of stack space because it would increase
2601
// the caller stack usage dramatically.
2602
if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2603
auto IR =
2604
InlineResult::failure("recursive and allocates too much stack space");
2605
if (ORE)
2606
ORE->emit([&]() {
2607
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2608
&CandidateCall)
2609
<< NV("Callee", &F) << " is "
2610
<< NV("InlineResult", IR.getFailureReason())
2611
<< ". Cost is not fully computed";
2612
});
2613
return IR;
2614
}
2615
2616
if (shouldStop())
2617
return InlineResult::failure(
2618
"Call site analysis is not favorable to inlining.");
2619
}
2620
2621
return InlineResult::success();
2622
}
2623
2624
/// Compute the base pointer and cumulative constant offsets for V.
2625
///
2626
/// This strips all constant offsets off of V, leaving it the base pointer, and
2627
/// accumulates the total constant offset applied in the returned constant. It
2628
/// returns null if V is not a pointer, and returns the constant '0' if there are
2629
/// no constant offsets applied.
2630
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2631
if (!V->getType()->isPointerTy())
2632
return nullptr;
2633
2634
unsigned AS = V->getType()->getPointerAddressSpace();
2635
unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2636
APInt Offset = APInt::getZero(IntPtrWidth);
2637
2638
// Even though we don't look through PHI nodes, we could be called on an
2639
// instruction in an unreachable block, which may be on a cycle.
2640
SmallPtrSet<Value *, 4> Visited;
2641
Visited.insert(V);
2642
do {
2643
if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2644
if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2645
return nullptr;
2646
V = GEP->getPointerOperand();
2647
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2648
if (GA->isInterposable())
2649
break;
2650
V = GA->getAliasee();
2651
} else {
2652
break;
2653
}
2654
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2655
} while (Visited.insert(V).second);
2656
2657
Type *IdxPtrTy = DL.getIndexType(V->getType());
2658
return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2659
}
2660
2661
/// Find dead blocks due to deleted CFG edges during inlining.
2662
///
2663
/// If we know the successor of the current block, \p CurrBB, has to be \p
2664
/// NextBB, the other successors of \p CurrBB are dead if these successors have
2665
/// no live incoming CFG edges. If one block is found to be dead, we can
2666
/// continue growing the dead block list by checking the successors of the dead
2667
/// blocks to see if all their incoming edges are dead or not.
2668
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2669
auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2670
// A CFG edge is dead if the predecessor is dead or the predecessor has a
2671
// known successor which is not the one under exam.
2672
return (DeadBlocks.count(Pred) ||
2673
(KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2674
};
2675
2676
auto IsNewlyDead = [&](BasicBlock *BB) {
2677
// If all the edges to a block are dead, the block is also dead.
2678
return (!DeadBlocks.count(BB) &&
2679
llvm::all_of(predecessors(BB),
2680
[&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2681
};
2682
2683
for (BasicBlock *Succ : successors(CurrBB)) {
2684
if (Succ == NextBB || !IsNewlyDead(Succ))
2685
continue;
2686
SmallVector<BasicBlock *, 4> NewDead;
2687
NewDead.push_back(Succ);
2688
while (!NewDead.empty()) {
2689
BasicBlock *Dead = NewDead.pop_back_val();
2690
if (DeadBlocks.insert(Dead).second)
2691
// Continue growing the dead block lists.
2692
for (BasicBlock *S : successors(Dead))
2693
if (IsNewlyDead(S))
2694
NewDead.push_back(S);
2695
}
2696
}
2697
}
2698
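// Illustrative sketch, not part of the original analysis: the dead-block
// discovery pattern on a tiny adjacency-matrix CFG with four blocks, block 0
// being the entry (assumed live). A block becomes dead once every incoming
// edge is dead, and newly dead blocks go on a worklist so the property
// propagates forward. All names and sizes here are hypothetical.
static void exampleFindDeadBlocks(const bool Edge[4][4], bool Dead[4],
                                  const int KnownSucc[4] /* -1 if unknown */) {
  auto EdgeIsDead = [&](int Pred, int Succ) {
    return Dead[Pred] || (KnownSucc[Pred] >= 0 && KnownSucc[Pred] != Succ);
  };
  auto IsNewlyDead = [&](int BB) {
    if (Dead[BB])
      return false;
    for (int P = 0; P != 4; ++P)
      if (Edge[P][BB] && !EdgeIsDead(P, BB))
        return false; // at least one live incoming edge keeps BB alive
    return true;
  };
  int Worklist[4], N = 0;
  for (int BB = 1; BB != 4; ++BB)
    if (IsNewlyDead(BB)) {
      Dead[BB] = true;
      Worklist[N++] = BB;
    }
  while (N) {
    int BB = Worklist[--N];
    // Killing BB may in turn kill its successors; keep growing the set.
    for (int S = 1; S != 4; ++S)
      if (Edge[BB][S] && IsNewlyDead(S)) {
        Dead[S] = true;
        Worklist[N++] = S;
      }
  }
}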
2699
/// Analyze a call site for potential inlining.
2700
///
2701
/// Returns true if inlining this call is viable, and false if it is not
2702
/// viable. It computes the cost and adjusts the threshold based on numerous
2703
/// factors and heuristics. If this method returns false but the computed cost
2704
/// is below the computed threshold, then inlining was forcibly disabled by
2705
/// some artifact of the routine.
2706
InlineResult CallAnalyzer::analyze() {
2707
++NumCallsAnalyzed;
2708
2709
auto Result = onAnalysisStart();
2710
if (!Result.isSuccess())
2711
return Result;
2712
2713
if (F.empty())
2714
return InlineResult::success();
2715
2716
Function *Caller = CandidateCall.getFunction();
2717
// Check if the caller function is recursive itself.
2718
for (User *U : Caller->users()) {
2719
CallBase *Call = dyn_cast<CallBase>(U);
2720
if (Call && Call->getFunction() == Caller) {
2721
IsCallerRecursive = true;
2722
break;
2723
}
2724
}
2725
2726
// Populate our simplified values by mapping from function arguments to call
2727
// arguments with known important simplifications.
2728
auto CAI = CandidateCall.arg_begin();
2729
for (Argument &FAI : F.args()) {
2730
assert(CAI != CandidateCall.arg_end());
2731
if (Constant *C = dyn_cast<Constant>(CAI))
2732
SimplifiedValues[&FAI] = C;
2733
2734
Value *PtrArg = *CAI;
2735
if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2736
ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2737
2738
// We can SROA any pointer arguments derived from alloca instructions.
2739
if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2740
SROAArgValues[&FAI] = SROAArg;
2741
onInitializeSROAArg(SROAArg);
2742
EnabledSROAAllocas.insert(SROAArg);
2743
}
2744
}
2745
++CAI;
2746
}
2747
NumConstantArgs = SimplifiedValues.size();
2748
NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2749
NumAllocaArgs = SROAArgValues.size();
2750
2751
// FIXME: If a caller has multiple calls to a callee, we end up recomputing
2752
// the ephemeral values multiple times (and they're completely determined by
2753
// the callee, so this is purely duplicate work).
2754
SmallPtrSet<const Value *, 32> EphValues;
2755
CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2756
2757
// The worklist of live basic blocks in the callee *after* inlining. We avoid
2758
// adding basic blocks of the callee which can be proven to be dead for this
2759
// particular call site in order to get more accurate cost estimates. This
2760
// requires a somewhat heavyweight iteration pattern: we need to walk the
2761
// basic blocks in a breadth-first order as we insert live successors. To
2762
// accomplish this, prioritizing for small iterations because we exit after
2763
// crossing our threshold, we use a small-size optimized SetVector.
2764
typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2765
BBSetVector BBWorklist;
2766
BBWorklist.insert(&F.getEntryBlock());
2767
2768
// Note that we *must not* cache the size; this loop grows the worklist.
2769
for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2770
if (shouldStop())
2771
break;
2772
2773
BasicBlock *BB = BBWorklist[Idx];
2774
if (BB->empty())
2775
continue;
2776
2777
onBlockStart(BB);
2778
2779
// Disallow inlining a blockaddress with uses other than strictly callbr.
2780
// A blockaddress only has defined behavior for an indirect branch in the
2781
// same function, and we do not currently support inlining indirect
2782
// branches. But, the inliner may not see an indirect branch that ends up
2783
// being dead code at a particular call site. If the blockaddress escapes
2784
// the function, e.g., via a global variable, inlining may lead to an
2785
// invalid cross-function reference.
2786
// FIXME: pr/39560: continue relaxing this overt restriction.
2787
if (BB->hasAddressTaken())
2788
for (User *U : BlockAddress::get(&*BB)->users())
2789
if (!isa<CallBrInst>(*U))
2790
return InlineResult::failure("blockaddress used outside of callbr");
2791
2792
// Analyze the cost of this block. If we blow through the threshold, this
2793
// returns false, and we can bail out.
2794
InlineResult IR = analyzeBlock(BB, EphValues);
2795
if (!IR.isSuccess())
2796
return IR;
2797
2798
Instruction *TI = BB->getTerminator();
2799
2800
// Add in the live successors by first checking whether we have terminator
2801
// that may be simplified based on the values simplified by this call.
2802
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2803
if (BI->isConditional()) {
2804
Value *Cond = BI->getCondition();
2805
if (ConstantInt *SimpleCond =
2806
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2807
BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2808
BBWorklist.insert(NextBB);
2809
KnownSuccessors[BB] = NextBB;
2810
findDeadBlocks(BB, NextBB);
2811
continue;
2812
}
2813
}
2814
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2815
Value *Cond = SI->getCondition();
2816
if (ConstantInt *SimpleCond =
2817
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2818
BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2819
BBWorklist.insert(NextBB);
2820
KnownSuccessors[BB] = NextBB;
2821
findDeadBlocks(BB, NextBB);
2822
continue;
2823
}
2824
}
2825
2826
// If we're unable to select a particular successor, just count all of
2827
// them.
2828
for (BasicBlock *Succ : successors(BB))
2829
BBWorklist.insert(Succ);
2830
2831
onBlockAnalyzed(BB);
2832
}
2833
2834
// If this is a noduplicate call, we can still inline as long as
2835
// inlining this would cause the removal of the caller (so the instruction
2836
// is not actually duplicated, just moved).
2837
if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2838
return InlineResult::failure("noduplicate");
2839
2840
// If the callee's stack size exceeds the user-specified threshold,
2841
// do not let it be inlined.
2842
// The command line option overrides a limit set in the function attributes.
2843
size_t FinalStackSizeThreshold = StackSizeThreshold;
2844
if (!StackSizeThreshold.getNumOccurrences())
2845
if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2846
Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2847
FinalStackSizeThreshold = *AttrMaxStackSize;
2848
if (AllocatedSize > FinalStackSizeThreshold)
2849
return InlineResult::failure("stacksize");
2850
2851
return finalizeAnalysis();
2852
}
2853
2854
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2855
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2856
if (PrintInstructionComments)
2857
F.print(OS, &Writer);
2858
DEBUG_PRINT_STAT(NumConstantArgs);
2859
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2860
DEBUG_PRINT_STAT(NumAllocaArgs);
2861
DEBUG_PRINT_STAT(NumConstantPtrCmps);
2862
DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2863
DEBUG_PRINT_STAT(NumInstructionsSimplified);
2864
DEBUG_PRINT_STAT(NumInstructions);
2865
DEBUG_PRINT_STAT(SROACostSavings);
2866
DEBUG_PRINT_STAT(SROACostSavingsLost);
2867
DEBUG_PRINT_STAT(LoadEliminationCost);
2868
DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2869
DEBUG_PRINT_STAT(Cost);
2870
DEBUG_PRINT_STAT(Threshold);
2871
#undef DEBUG_PRINT_STAT
2872
}
2873
2874
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2875
/// Dump stats about this call's analysis.
2876
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2877
#endif
2878

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy, not a reference. The legacy pass
  // manager caches the most recently created TLI in the
  // TargetLibraryInfoWrapperPass object, and always returns the same object
  // (which is overwritten on each GetTLI call). Therefore we copy the first
  // result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
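      // Worked example (illustrative): with 64-bit pointers, a 256-bit byval
      // struct needs ceil(256 / 64) = 4 words, so the estimate is
      // 2 * 4 * InstrCost (one load plus one store per word). A 4096-bit
      // byval would need 64 words, but the count is capped at 8, giving
      // 2 * 8 * InstrCost.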
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}

InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}

std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}
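
// Usage sketch (illustrative only; CB, GetTTI, GetAssumptionCache, GetBFI, PSI
// and ORE below are assumed to be provided by the caller and are not defined
// in this file):
//
//   std::optional<int> Cost = getInliningCostEstimate(
//       CB, GetTTI(*CB.getCalledFunction()), GetAssumptionCache, GetBFI,
//       &PSI, &ORE);
//   if (Cost)
//     LLVM_DEBUG(dbgs() << "estimated inlining cost: " << *Cost << "\n");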

std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}

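// The result is three-valued: a successful InlineResult means the
// always-inline attribute forces inlining, a failed InlineResult names a
// reason the call must never be inlined, and std::nullopt means the
// attributes alone do not decide the question, so the caller falls back to
// the cost model (as llvm::getInlineCost() does below).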
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine is inlined into a caller coroutine before the
  // coro-split pass, the coro-early pass cannot handle it well. So we do not
  // inline a coroutine that has not been split yet.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return std::nullopt;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make the cost-benefit-based decision explicit.
  // We use always/never here since the threshold is not meaningful, as it is
  // not what drives the cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing this
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;
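  // For example, passing -inline-threshold=500 on the command line makes
  // DefaultThreshold 500 regardless of the Threshold argument, while
  // getInlineParams(100) without the flag yields a DefaultThreshold of 100.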

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
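
// For example, getInlineParams(3, 0) starts from OptAggressiveThreshold,
// getInlineParams(2, 1) from OptSizeThreshold (-Os), and getInlineParams(2, 2)
// from OptMinSizeThreshold (-Oz).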

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of inliner's decisions.
  // We can add a flag which determines InlineParams for this run. Right now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
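
// Usage note: assuming this printer pass is registered under the name
// "print<inline-cost>" (its registration lives outside this file), the
// annotations above can be requested with the new pass manager, e.g.:
//   opt -passes='print<inline-cost>' -disable-output input.ll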