GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
//===- FunctionSpecialization.cpp - Function Specialization ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <cmath>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"

STATISTIC(NumSpecsCreated, "Number of specializations created");

static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden, cl::desc(
    "Force function specialization for every call site with a constant "
    "argument"));

static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc(
    "The maximum number of clones allowed for a single function "
    "specialization"));

static cl::opt<unsigned>
    MaxDiscoveryIterations("funcspec-max-discovery-iterations", cl::init(100),
                           cl::Hidden,
                           cl::desc("The maximum number of iterations allowed "
                                    "when searching for transitive "
                                    "phis"));

static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of incoming values a PHI node can have to be "
             "considered during the specialization bonus estimation"));

static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
    "The maximum number of predecessors a basic block can have to be "
    "considered during the estimation of dead code"));

static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(300), cl::Hidden, cl::desc(
    "Don't specialize functions that have less than this number of "
    "instructions"));

static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden, cl::desc(
    "Maximum codesize growth allowed per function"));

static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
    "Reject specializations whose codesize savings are less than this "
    "much percent of the original function size"));

static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(40), cl::Hidden,
    cl::desc("Reject specializations whose latency savings are less than this "
             "much percent of the original function size"));

static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
    "Reject specializations whose inlining bonus is less than this "
    "much percent of the original function size"));

static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
    "Enable function specialization on the address of global values"));

// Disabled by default as it can significantly increase compilation times.
//
// https://llvm-compile-time-tracker.com
// https://github.com/nikic/llvm-compile-time-tracker
static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(false), cl::Hidden, cl::desc(
    "Enable specialization of functions that take a literal constant as an "
    "argument"));

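// Usage note (illustrative, not exhaustive): the above are ordinary cl::opt
// flags, so they can be set on the opt command line in a build where this
// code runs, e.g.
//
//   opt -passes=ipsccp -force-specialization -funcspec-max-clones=5 in.ll
//
// The exact pipeline spelling that enables function specialization varies
// between LLVM versions.
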
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                            DenseSet<BasicBlock *> &DeadBlocks) {
  unsigned I = 0;
  return all_of(predecessors(Succ),
                [&I, BB, Succ, &DeadBlocks](BasicBlock *Pred) {
                  return I++ < MaxBlockPredecessors &&
                         (Pred == BB || Pred == Succ ||
                          DeadBlocks.contains(Pred));
                });
}

// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the instruction cost of each basic block on the worklist.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead if we propagate the specialization arguments.
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // Disregard SSA copies.
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::ssa_copy)
          continue;
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) &&
          canEliminateSuccessor(BB, SuccBB, DeadBlocks))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}

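// For illustration (hypothetical IR, not from this file): suppose %arg is
// bound to 0 in a specialization of
//
//   define internal i32 @f(i32 %arg) {
//   entry:
//     %cmp = icmp eq i32 %arg, 0
//     br i1 %cmp, label %then, label %else
//   ...
//
// The conditional branch folds, so %else (reachable only through this branch)
// is seeded into the worklist, and estimateBasicBlocks() counts the codesize
// of its instructions, and of any successors reachable only via dead blocks,
// as savings.
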
static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  return KnownConstants.lookup(V);
}

Bonus InstCostVisitor::getBonusFromPendingPHIs() {
  Bonus B;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      B += getUserBonus(Phi);
  }
  return B;
}

/// Compute a bonus for replacing argument \p A with constant \p C.
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Bonus B;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << B.CodeSize << ", Latency = " << B.Latency
                    << "} for argument " << *A << "\n");
  return B;
}

Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return {0, 0};

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return {0, 0};
  }

  // Even though it doesn't make sense to bind a constant to a switch or
  // branch instruction (unlike any other instruction type), recording one
  // here prevents us from estimating their bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
                    BFI.getEntryFreq().getFrequency();

  Cost Latency = Weight *
      TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << ", Latency = " << Latency << "} for user "
                    << *User << "\n");

  Bonus B(CodeSize, Latency);
  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, User, C);

  return B;
}

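// Worked example of the weighting above: if a foldable user sits in a block
// whose frequency is 8x the entry block's, and TTI reports a latency cost of
// 2 for it, the user contributes Weight * Latency = 8 * 2 = 16 to the latency
// bonus, while its TCK_CodeSize cost is added unweighted to the codesize
// bonus.
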
Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB, DeadBlocks))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}

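// For illustration (hypothetical IR): given a known condition %c == 1 in
//
//   switch i32 %c, label %default [ i32 0, label %bb0
//                                   i32 1, label %bb1 ]
//
// %bb1 is the live successor, so %bb0 and %default are seeded into the
// worklist, provided each is executable and reachable only through this
// switch (or through other dead blocks).
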
Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) &&
      canEliminateSuccessor(I.getParent(), Succ, DeadBlocks))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}

bool InstCostVisitor::discoverTransitivelyIncomingValues(
    Constant *Const, PHINode *Root, DenseSet<PHINode *> &TransitivePHIs) {

  SmallVector<PHINode *, 64> WorkList;
  WorkList.push_back(Root);
  unsigned Iter = 0;

  while (!WorkList.empty()) {
    PHINode *PN = WorkList.pop_back_val();

    if (++Iter > MaxDiscoveryIterations ||
        PN->getNumIncomingValues() > MaxIncomingPhiValues)
      return false;

    if (!TransitivePHIs.insert(PN).second)
      continue;

    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
      Value *V = PN->getIncomingValue(I);

      // Disregard self-references and dead incoming values.
      if (auto *Inst = dyn_cast<Instruction>(V))
        if (Inst == PN || DeadBlocks.contains(PN->getIncomingBlock(I)))
          continue;

      if (Constant *C = findConstantFor(V, KnownConstants)) {
        // Not all incoming values are the same constant. Bail immediately.
        if (C != Const)
          return false;
        continue;
      }

      if (auto *Phi = dyn_cast<PHINode>(V)) {
        WorkList.push_back(Phi);
        continue;
      }

      // We can't reason about anything else.
      return false;
    }
  }
  return true;
}

Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;
  bool HaveSeenIncomingPHI = false;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);

    // Disregard self-references and dead incoming values.
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
        continue;

    if (Constant *C = findConstantFor(V, KnownConstants)) {
      if (!Const)
        Const = C;
      // Not all incoming values are the same constant. Bail immediately.
      if (C != Const)
        return nullptr;
      continue;
    }

    if (Inserted) {
      // First time we are seeing this phi. We will retry later, after
      // all the constant arguments have been propagated. Bail for now.
      PendingPHIs.push_back(&I);
      return nullptr;
    }

    if (isa<PHINode>(V)) {
      // Perhaps it is a transitive phi. We will confirm later.
      HaveSeenIncomingPHI = true;
      continue;
    }

    // We can't reason about anything else.
    return nullptr;
  }

  if (!Const)
    return nullptr;

  if (!HaveSeenIncomingPHI)
    return Const;

  DenseSet<PHINode *> TransitivePHIs;
  if (!discoverTransitivelyIncomingValues(Const, &I, TransitivePHIs))
    return nullptr;

  return Const;
}

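// For illustration (hypothetical IR): a cycle of phis whose only non-phi
// incoming values are the same constant is a "transitive phi":
//
//   %a = phi i32 [ 3, %entry ], [ %b, %bb2 ]
//   ...
//   %b = phi i32 [ 3, %bb1 ], [ %a, %bb3 ]
//
// discoverTransitivelyIncomingValues() walks such cycles so that both phis
// can be treated as the constant 3.
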
Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  // Skip the last operand, which is the callee itself.
  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}

Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}

Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}

Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return nullptr;

  Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                : I.getTrueValue();
  Constant *C = findConstantFor(V, KnownConstants);
  return C;
}

Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}

Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return Swap ?
        ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL)
      : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
}

Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}

Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return dyn_cast_or_null<Constant>(Swap ?
        simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL))
      : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL)));
}

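// For illustration: once an argument is known, the visitors above fold whole
// chains of users. E.g. with %n bound to 8 (hypothetical IR):
//
//   %add = add i32 %n, 2          ; visitBinaryOperator -> 10
//   %cmp = icmp slt i32 %add, 16  ; visitCmpInst        -> true
//   br i1 %cmp, ...               ; estimateBranchInst  -> dead successor
//
// Each folded user is recorded in KnownConstants so that its own users can be
// folded in turn by getUserBonus().
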
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;
    if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
      if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
        return nullptr;
      continue;
    }

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}

// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}

// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
//   define internal void @RecursiveFn(i32* arg1) {
//     %temp = alloca i32, align 4
//     store i32 2, i32* %temp, align 4
//     call void @RecursiveFn.1(i32* nonnull %temp)
//     ret void
//   }
//
// Before the next iteration, we need to promote the stored constant like so,
// which allows further specialization in subsequent iterations:
//
//   @funcspec.arg = internal constant i32 2
//
//   define internal void @RecursiveFn(i32* arg1) {
//     call void @RecursiveFn.1(i32* nonnull @funcspec.arg)
//     ret void
//   }
//
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      Call->setArgOperand(Idx, GV);
    }
  }
}

// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}

/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}

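// For illustration: the solver may funnel a use of %x through a copy such as
//
//   %x.copy = call i32 @llvm.ssa.copy.i32(i32 %x)
//
// removeSSACopy() replaces all uses of %x.copy with %x and erases the call.
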
template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};

FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
    if (NumSpecsCreated > 0)
      dbgs() << "FnSpecialization: Created " << NumSpecsCreated
             << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}

/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (!ForceSpecialization && !F.hasFnAttribute(Attribute::NoInline) &&
         Metrics.NumInsts < MinFunctionSize))
      continue;

    // TODO: For now only consider recursive functions when running multiple
    // times. This should change if specialization on literal constants gets
    // enabled.
    if (!Inserted && !Metrics.isRecursive && !SpecializeLiteralConstant)
      continue;

    int64_t Sz = *Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t, NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }

  // Choose the most profitable specialisations that fit within the module
  // specialization budget, which is derived from the maximum number of
  // specializations allowed per candidate function.
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    if (AllSpecs[I].Score != AllSpecs[J].Score)
      return AllSpecs[I].Score > AllSpecs[J].Score;
    return I > J;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations\n";
             for (unsigned I = 0; I < NSpecs; ++I) {
               const Spec &S = AllSpecs[BestSpecs[I]];
               dbgs() << "FnSpecialization: Function " << S.F->getName()
                      << ", score " << S.Score << "\n";
               for (const ArgInfo &Arg : S.Sig.Args)
                 dbgs() << "FnSpecialization: FormalArg = "
                        << Arg.Formal->getNameOrAsOperand()
                        << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
                        << "\n";
             });

  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];
    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }

  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified callsites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}

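// For illustration (hypothetical module): given
//
//   define internal i32 @compute(i32 %x, ptr %op) { ... }
//   %r = call i32 @compute(i32 %v, ptr @square)
//
// a successful run() clones @compute into @compute.specialized.1 with %op
// bound to @square (see createSpecialization() and cloneCandidateFunction()
// below) and redirects the call site to the clone.
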
void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}

/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}

bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the AllSpecs array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has the MinSize attribute set, we do not
    // specialize it.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may end up not being the best specialisation for all
      // the cloned instances of this call, which result from specialising
      // functions. Hence we don't rewrite the call directly, but match it with
      // the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Bonus B;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      B += Visitor.getBonusFromPendingPHIs();

      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                        << B.CodeSize << ", Latency = " << B.Latency
                        << ", Inlining = " << Score << "}\n");

      FunctionGrowth[F] += FuncSize - B.CodeSize;

      auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
                             unsigned FuncGrowth) -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;
        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;
        // Minimum codesize savings.
        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
          return false;
        // Minimum latency savings.
        if (B.Latency < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
          return false;
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
        continue;

      // Create a new specialisation entry.
      Score += std::max(B.CodeSize, B.Latency);
      auto &Spec = AllSpecs.emplace_back(F, S, Score);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}

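// Worked example of IsProfitable() with the default thresholds and
// FuncSize = 300:
//   - an inlining bonus Score > 300 * 300 / 100 = 900 is accepted outright;
//   - otherwise the codesize savings must reach 20% of FuncSize (>= 60), the
//     latency savings must reach 40% (>= 120), and the accumulated codesize
//     growth must stay within 3x FuncSize, or the candidate is discarded.
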
bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (F->hasOptSize() ||
      shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // There is no point specializing a function that will ultimately be
  // inlined anyway.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}

Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;

  return Clone;
}

/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}

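// For illustration (hypothetical IR): in
//
//   define internal i32 @apply(ptr %fn, i32 %v) {
//     %r = call i32 %fn(i32 %v)    ; indirect call through the argument
//     ret i32 %r
//   }
//
// specializing @apply on %fn == @callee turns the indirect call into a direct
// call to @callee; the bonus above estimates how likely @callee is to be
// inlined at that call site as a result.
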
/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() && (!SpecializeLiteralConstant ||
      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // The SCCP solver does not record an argument that will be constructed on
  // the stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize
  // based on this argument. No point in specialization if the lattice value
  // is already a constant.
  bool IsOverdefined = Ty->isStructTy()
    ? any_of(Solver.getStructLatticeValueFor(A), SCCPSolver::isOverdefined)
    : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}

/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}

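// For illustration: given "@mut = internal global i32 0" and
// "@tbl = internal constant [4 x i32] [...]", passing @tbl as an actual
// argument yields a candidate constant, whereas @mut is rejected unless
// -funcspec-on-address is set.
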
void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}
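// For illustration: if @f ends up with two surviving specialisations, one
// matching only {%x = 1} (score 50) and one matching {%x = 1, %y = @g}
// (score 80), a call site passing both constants is redirected to the
// higher-scoring clone; call sites matching no signature keep calling the
// original @f, which therefore stays alive.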