//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop, allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
//===----------------------------------------------------------------------===//
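//
// As a motivating sketch (illustrative only; the arrays and bounds are
// hypothetical, not taken from this file), a loop such as
//
//   for (i = 0; i < n; i++) {
//     A[i + 1] = A[i] * B[i];  // dependence cycle, not vectorizable
//     C[i] = D[i] * E[i];      // no cycle, vectorizable
//   }
//
// can be distributed into a sequential loop that isolates the cycle and a
// coincident loop that the vectorizer can then handle:
//
//   for (i = 0; i < n; i++)
//     A[i + 1] = A[i] * B[i];
//   for (i = 0; i < n; i++)
//     C[i] = D[i] * E[i];
//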

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}
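
// As an illustration (a sketch following the general LLVM
// transformation-metadata conventions, not something specific to this file):
// a loop annotated with
//   #pragma clang loop distribute(enable)
// carries loop metadata of the shape
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.distribute.enable", i1 true}
// and the followup attribute names above select the metadata that is
// propagated to the loops produced by distribution (see setNewLoopID below).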

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loop marked with #pragma clang loop "
             "distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallSetVector<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I))
          Worklist.push_back(I);
      }
    }
  }

  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, this returns the
  /// original loop.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used by
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards so that fewer def-use and use-def
    // chains need to be updated.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print(raw_ostream &OS) const {
    OS << (DepCycle ? " (cycle)\n" : "\n");
    for (auto *I : Set)
      // Prefix with the block name.
      OS << " " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks(raw_ostream &OS) const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      OS << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them, and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that partition is marked to
  /// contain cycles. Otherwise starts a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  /// Initially we isolate memory instructions into as many partitions as
  /// possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
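  ///
  /// For example (illustrative): if the same load appears in partitions P0
  /// and P2 of P0, P1, P2, P3, then P1 and P2 are merged into P0, leaving
  /// P0 and P3.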
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(
                dbgs()
                << "LDist: Merging partitions due to this load in multiple "
                << "partitions: " << PartI << ", " << LoadToPart->second << "\n"
                << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader; this
    // leaves the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                    ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// Sets up the mapping from instructions to partitions. If an
  /// instruction is duplicated across multiple partitions, set its entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
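    //
    // Illustratively (block names are hypothetical), with two cloned
    // partitions the final layout is
    //   Pred -> Ph0 -> Loop0 -> Ph1 -> Loop1 -> OrigPH -> OrigLoop -> Exit
    // where each cloned loop's exit is remapped to the next loop's preheader
    // and the last partition reuses the original loop.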
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
      NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part.getVMap()[ExitBlock] = TopPH;
      Part.remapInstructions();
      setNewLoopID(OrigLoopID, &Part);
      --Index;
      TopPH = NewLoop->getLoopPreheader();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, computes the id of the partition the pointer
  /// is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used
  /// in multiple partitions its entry is set to -1.
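  ///
  /// For example (illustrative): with three checked pointers where the first
  /// is only accessed in partition 0, the second only in partition 1, and
  /// the third in both, the result would be {0, 1, -1}.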
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "LDist: Partition " << Index++ << ":";
      P.print(OS);
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "LDist: Partition " << Index++ << ":";
      P.printBlocks(OS);
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
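  ///
  /// For example (illustrative): given partitions P0, P1, P2, P3 where the
  /// predicate holds for P0, P1 and P3 but not for P2, P1 is merged into P0,
  /// leaving P0', P2, P3.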
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    std::optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(*PartitionID);
    }
  }
};

/// For each memory instruction, this class maintains the difference between
/// the number of unsafe dependences that start at this instruction and the
/// number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this count, we know whether any unsafe dependence crosses over a given
/// program point.
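///
/// For instance (illustrative, mirroring the worked example in
/// LoopDistributeForLoop::processLoop below): with a single backward
/// dependence Store3 -> Load1, the per-instruction deltas and running sum are
///   Load1: +1 (sum 1), Load2: 0 (sum 1), Store3: -1 (sum 0), Load4: 0 (sum 0)
/// so the dependence is known to cross over the program point at Load2.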
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n");
    for (const auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
                        OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
  bool processLoop() {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '"
                      << L->getHeader()->getParent()->getName() << "' from "
                      << L->getLocStr() << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &LAIs.getInfo(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "
                      << L->getHeader()->getName() << "\n");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.               1                          0->1
    //  Load2  | /Unsafe/      0                          1
    //  Store3 -'             -1                          1->0
    //  Load4                  0                          0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (const auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so we catch
      // the start of a dependence directly via
      // NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // any run-time checks at all if inserting them is illegal.
    const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.value_or(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "LDist: Distributing loop: "
                      << L->getHeader()->getName() << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple, have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "LDist: Pointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID = *makeFollowupLoopID(
          OrigLoopID,
          {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
          "llvm.loop.distribute.", true);
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, remove from each loop the instructions that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks(dbgs()));

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().value_or(false);

    LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Returns whether distribution is forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const std::optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups need
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    std::optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  LoopAccessInfoManager &LAIs;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  std::optional<bool> IsForced;
};

} // end anonymous namespace

static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    LoopAccessInfoManager &LAIs) {
  // Build up a worklist of inner-loops to distribute. This is necessary as the
  // act of distributing a loop creates new loops and can invalidate iterators
  // across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().value_or(EnableLoopDistribute))
      Changed |= LDL.processLoop();
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}