Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
35266 views
1
//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
/// This file implements SLP analysis based on VPlan. The analysis is based on
9
/// the ideas described in
10
///
11
/// Look-ahead SLP: auto-vectorization in the presence of commutative
12
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13
/// Luís F. W. Góes
14
///
15
//===----------------------------------------------------------------------===//
16
17
#include "VPlan.h"
18
#include "VPlanValue.h"
19
#include "llvm/ADT/DenseMap.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/Analysis/VectorUtils.h"
22
#include "llvm/IR/Instruction.h"
23
#include "llvm/IR/Instructions.h"
24
#include "llvm/IR/Type.h"
25
#include "llvm/IR/Value.h"
26
#include "llvm/Support/Casting.h"
27
#include "llvm/Support/Debug.h"
28
#include "llvm/Support/ErrorHandling.h"
29
#include "llvm/Support/raw_ostream.h"
30
#include <algorithm>
31
#include <cassert>
32
#include <optional>
33
#include <utility>
34
35
using namespace llvm;
36
37
#define DEBUG_TYPE "vplan-slp"
38
39
// Number of levels to look ahead when re-ordering multi node operands.
// Bounds the recursion depth of getLAScore (and therefore the cost of
// getBest's look-ahead tie-breaking loop).
static unsigned LookaheadMaxDepth = 5;
41
42
/// Record that the current SLP attempt cannot be completed and return the
/// nullptr sentinel callers use in place of a combined instruction.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  // buildGraph checks CompletelySLP before materializing the final combined
  // VPInstruction, so setting it here aborts the whole graph.
  CompletelySLP = false;
  return nullptr;
}
48
49
/// Record \p New as the combined instruction for the bundle \p Operands and
/// update WidestBundleBits with the bundle's total scalar bit width.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // The bundle width can only be computed when every lane has an underlying
  // IR instruction to take a type from.
  bool HaveUnderlyingInstrs = all_of(Operands, [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr();
  });
  if (HaveUnderlyingInstrs) {
    unsigned BundleBits = 0;
    for (VPValue *V : Operands) {
      Type *Ty = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!Ty->isVectorTy() && "Only scalar types supported for now");
      BundleBits += Ty->getScalarSizeInBits();
    }
    if (BundleBits > WidestBundleBits)
      WidestBundleBits = BundleBits;
  }

  // Each bundle may be combined at most once; remember the mapping so
  // buildGraph can re-use the node on a second visit.
  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}
67
68
/// Returns true if the bundle \p Operands is legal to vectorize as a unit:
/// every lane is a VPInstruction with an underlying IR instruction, all lanes
/// share opcode and primitive type width, are defined in this SLP instance's
/// basic block, have at most one unique user each, and (for memory ops) are
/// simple loads/stores with no memory-writing instruction between the loads.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // A value with more than one unique user would give the combined node
  // multiple users, breaking the SLP-tree property buildGraph asserts on.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    // Walk the block in order, counting the bundle's loads as we pass them.
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // All loads of the bundle seen; later writes cannot sit between them.
      if (LoadsSeen == Operands.size())
        break;
      // A write after the first load but before the last may clobber memory
      // a later load of the bundle reads.
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    // Only simple (non-volatile, non-atomic) loads are supported.
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  // Likewise, only simple stores are supported.
  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}
153
154
/// Collect the operand at index \p OperandIndex of every value in \p Values,
/// preserving the order of \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Ops;
  Ops.reserve(Values.size());
  // Currently we only support VPInstructions, so the checked cast is safe.
  for (VPValue *V : Values)
    Ops.push_back(cast<VPInstruction>(V)->getOperand(OperandIndex));
  return Ops;
}
164
165
/// Returns true if the opcode of the bundle is commutative. Only lane 0 is
/// inspected; callers have already checked that all lanes share one opcode.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  const unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  return Instruction::isCommutative(Opcode);
}
169
170
/// Build one operand bundle per operand position of the values in \p Values.
/// For stores only operand 0 (the stored value) is bundled; loads never reach
/// here because they terminate the SLP tree.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);
  const unsigned Opcode = VPI->getOpcode();

  if (Opcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  if (Opcode == Instruction::Store) {
    // Only the stored value (operand 0) is part of the tree.
    Result.push_back(getOperands(Values, 0));
  } else {
    for (unsigned Idx = 0, NumOps = VPI->getNumOperands(); Idx != NumOps;
         ++Idx)
      Result.push_back(getOperands(Values, Idx));
  }

  return Result;
}
189
190
/// Returns the opcode of Values or ~0 if they do not all agree.
191
static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
192
unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
193
if (any_of(Values, [Opcode](VPValue *V) {
194
return cast<VPInstruction>(V)->getOpcode() != Opcode;
195
}))
196
return std::nullopt;
197
return {Opcode};
198
}
199
200
/// Returns true if A and B access sequential memory if they are loads or
201
/// stores or if they have identical opcodes otherwise.
202
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
203
VPInterleavedAccessInfo &IAI) {
204
if (A->getOpcode() != B->getOpcode())
205
return false;
206
207
if (A->getOpcode() != Instruction::Load &&
208
A->getOpcode() != Instruction::Store)
209
return true;
210
auto *GA = IAI.getInterleaveGroup(A);
211
auto *GB = IAI.getInterleaveGroup(B);
212
213
return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
214
}
215
216
/// Implements getLAScore from Listing 7 in the paper.
217
/// Traverses and compares operands of V1 and V2 to MaxLevel.
218
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
219
VPInterleavedAccessInfo &IAI) {
220
auto *I1 = dyn_cast<VPInstruction>(V1);
221
auto *I2 = dyn_cast<VPInstruction>(V2);
222
// Currently we only support VPInstructions.
223
if (!I1 || !I2)
224
return 0;
225
226
if (MaxLevel == 0)
227
return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
228
229
unsigned Score = 0;
230
for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
231
for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
232
Score +=
233
getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
234
return Score;
235
}
236
237
/// Choose from \p Candidates the best value to extend the lane chain ending
/// in \p Last, per \p Mode (consecutive load or matching opcode). Returns
/// {OpMode::Failed, nullptr} when no candidate is consecutive-or-matching;
/// otherwise returns \p Mode and the chosen value, which is also removed
/// from \p Candidates.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  // Phase 1: filter to candidates that are consecutive with / match Last.
  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  // Phase 2: break ties with look-ahead scores of increasing depth; stop
  // deepening as soon as some depth separates the candidates (AllSame false).
  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      // ~0u marks "no score seen yet" for the first candidate at this depth.
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  // NOTE(review): if every candidate scores 0 at every depth, Best remains
  // nullptr here and the debug dump below would dereference it in asserts
  // builds (the release path returns {Mode, nullptr}, which the caller maps
  // to markFailed()) — confirm whether this can occur in practice.
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}
295
296
/// Re-order the operand bundles collected in MultiNodeOps so that, for each
/// operand row, the value chosen for lane N+1 is the best match (consecutive
/// load / matching opcode, look-ahead-scored) for the value at lane N.
/// Returns the re-ordered (dummy instruction, lane values) pairs; rows where
/// no match was found get a markFailed() (nullptr) entry for that lane.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Seed each row with its lane-0 value and record whether the row should be
  // extended as a consecutive-load chain or by opcode match.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");

    // Candidate pool for this lane: the Lane-th value of every row.
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    // Iterate by const reference: each MultiNodeOps element carries the row's
    // vector of lane values, which the previous by-value loop copied on every
    // iteration of every lane.
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      // Extend row Op with the best remaining candidate for its last value.
      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}
343
344
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
345
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
346
dbgs() << " Ops: ";
347
for (auto *Op : Values) {
348
if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
349
if (auto *Instr = VPInstr->getUnderlyingInstr()) {
350
dbgs() << *Instr << " | ";
351
continue;
352
}
353
dbgs() << " nullptr | ";
354
}
355
dbgs() << "\n";
356
}
357
#endif
358
359
/// Recursively build the SLP graph for the bundle \p Values, combining each
/// bundle into one new VPInstruction (SLPLoad/SLPStore for memory ops, the
/// original opcode otherwise). Returns the combined instruction, or nullptr
/// (via markFailed) when the bundle cannot be SLP'ized.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  // Legality: all lanes must agree on opcode/width/block, etc.
  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // Commutative bundles start (or continue) a multi-node: operand bundles
    // whose opcode differs from ours are deferred and re-ordered later.
    // MultiNodeRoot marks the outermost commutative bundle, which is the one
    // that performs the re-ordering once all rows are collected.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create dummy VPInstruction, which we will replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      // Build the graph for each re-ordered row, then splice the result in
      // for the dummy placeholder (uses, operand slots) and free the dummy.
      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    // Loads terminate the tree: their address operand is taken as-is.
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Map memory opcodes to their SLP variants; everything else keeps its
  // original opcode.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A markFailed() anywhere in the recursion clears CompletelySLP; do not
  // materialize a combined instruction in that case.
  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  // Remember the bundle -> combined mapping for node re-use on later visits.
  addCombined(Values, VPI);
  return VPI;
}
470
471