CoCalc -- VPlanSLP.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
³⁵²⁶⁶ views
1
//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
/// This file implements SLP analysis based on VPlan. The analysis is based on
9
/// the ideas described in
10
///
11
///   Look-ahead SLP: auto-vectorization in the presence of commutative
12
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13
///   Luís F. W. Góes
14
///
15
//===----------------------------------------------------------------------===//
16

17
#include "VPlan.h"
18
#include "VPlanValue.h"
19
#include "llvm/ADT/DenseMap.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/Analysis/VectorUtils.h"
22
#include "llvm/IR/Instruction.h"
23
#include "llvm/IR/Instructions.h"
24
#include "llvm/IR/Type.h"
25
#include "llvm/IR/Value.h"
26
#include "llvm/Support/Casting.h"
27
#include "llvm/Support/Debug.h"
28
#include "llvm/Support/ErrorHandling.h"
29
#include "llvm/Support/raw_ostream.h"
30
#include <algorithm>
31
#include <cassert>
32
#include <optional>
33
#include <utility>
34

35
using namespace llvm;
36

37
#define DEBUG_TYPE "vplan-slp"
38

39
// Number of levels to look ahead when re-ordering multi node operands.
// This is a fixed tuning constant: nothing in this file writes to it, so it
// is declared constexpr to rule out accidental modification. It is used as
// the exclusive upper bound of the look-ahead depth loop in getBest().
static constexpr unsigned LookaheadMaxDepth = 5;
41

42
/// Records that the SLP graph could not be built completely by clearing
/// CompletelySLP. Always returns nullptr so that callers can write
/// `return markFailed();` to propagate the failure.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  //        trivially SLP'ize.
  VPInstruction *const NoNode = nullptr;
  CompletelySLP = false;
  return NoNode;
}
48

49
/// Registers \p New as the combined instruction for the bundle \p Operands.
///
/// If every bundle member has an underlying IR instruction, the summed scalar
/// bit width of the bundle is also folded into WidestBundleBits. Asserts that
/// no combined instruction was registered for the same bundle before.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // Shorthand for the IR instruction a bundle member was created from.
  auto UnderlyingOf = [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr();
  };

  // The bundle width is only computed when all members are backed by IR.
  const bool AllBackedByIR = all_of(
      Operands, [&](VPValue *V) { return UnderlyingOf(V) != nullptr; });
  if (AllBackedByIR) {
    unsigned TotalBits = 0;
    for (VPValue *Member : Operands) {
      Type *MemberTy = UnderlyingOf(Member)->getType();
      assert(!MemberTy->isVectorTy() && "Only scalar types supported for now");
      TotalBits += MemberTy->getScalarSizeInBits();
    }
    if (WidestBundleBits < TotalBits)
      WidestBundleBits = TotalBits;
  }

  auto Inserted = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Inserted.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Inserted;
}
67

68
/// Returns true if the bundle \p Operands can be combined into one SLP node.
///
/// The checks are applied in this order; the first failing check returns
/// false:
///   1. every member is a non-null VPInstruction with an underlying IR
///      instruction,
///   2. all members share the opcode and primitive type width of the first,
///   3. all members are defined in BB,
///   4. no member has more than one unique user,
///   5. for loads: no instruction that may write memory sits between the
///      bundle's loads in the parent block, and all loads are simple,
///   6. for stores: all stores are simple.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Shorthand for the IR instruction a bundle member was created from.
  auto UnderlyingOf = [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr();
  };

  // Currently we only support VPInstructions.
  for (VPValue *Op : Operands) {
    const bool IsBackedVPInst =
        Op && isa<VPInstruction>(Op) && UnderlyingOf(Op);
    if (!IsBackedVPInst) {
      LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
      return false;
    }
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *FirstInstr = UnderlyingOf(Operands[0]);
  unsigned Opcode = FirstInstr->getOpcode();
  unsigned Width = FirstInstr->getType()->getPrimitiveSizeInBits();
  for (VPValue *Op : Operands) {
    const Instruction *I = UnderlyingOf(Op);
    const bool AgreesWithFirst =
        I->getOpcode() == Opcode &&
        I->getType()->getPrimitiveSizeInBits() == Width;
    if (!AgreesWithFirst) {
      LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
      return false;
    }
  }

  // For now, all operands must be defined in the same BB.
  for (VPValue *Op : Operands) {
    if (cast<VPInstruction>(Op)->getParent() != &BB) {
      LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
      return false;
    }
  }

  for (VPValue *Op : Operands) {
    if (Op->hasMoreThanOneUniqueUser()) {
      LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
      return false;
    }
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned BundleLoadsSeen = 0;
    VPBasicBlock *Block = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &Recipe : *Block) {
      auto *Cur = dyn_cast<VPInstruction>(&Recipe);
      // The scan stops at the first recipe that is not a VPInstruction.
      if (!Cur)
        break;

      const bool IsBundleLoad = Cur->getOpcode() == Instruction::Load &&
                                llvm::is_contained(Operands, Cur);
      if (IsBundleLoad)
        ++BundleLoadsSeen;

      // Once the last load of the bundle was reached, nothing after it
      // matters.
      if (BundleLoadsSeen == Operands.size())
        break;
      if (BundleLoadsSeen > 0 && Cur->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    for (VPValue *Op : Operands) {
      if (!cast<LoadInst>(UnderlyingOf(Op))->isSimple()) {
        LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
        return false;
      }
    }
  }

  if (Opcode == Instruction::Store) {
    for (VPValue *Op : Operands) {
      if (!cast<StoreInst>(UnderlyingOf(Op))->isSimple()) {
        LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
        return false;
      }
    }
  }

  return true;
}
153

154
/// Collects operand number \p OperandIndex of every member of \p Values, in
/// bundle order. Every member must be a VPInstruction.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Collected;
  Collected.reserve(Values.size());
  // Currently we only support VPInstructions.
  for (VPValue *Member : Values)
    Collected.push_back(cast<VPInstruction>(Member)->getOperand(OperandIndex));
  return Collected;
}
164

165
/// Returns true if the opcode of the first member of \p Values is
/// commutative. Only the first member is inspected.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  auto *First = cast<VPInstruction>(Values[0]);
  const unsigned FirstOpcode = First->getOpcode();
  return Instruction::isCommutative(FirstOpcode);
}
169

170
/// Returns one operand bundle per operand position of the bundle \p Values.
///
/// The opcode and operand count are taken from the first member. Stores
/// only contribute operand 0; every other opcode contributes all of its
/// operands. Must not be called for loads.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  auto *First = cast<VPInstruction>(Values[0]);
  const unsigned FirstOpcode = First->getOpcode();

  if (FirstOpcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  const unsigned NumBundles =
      FirstOpcode == Instruction::Store ? 1u : First->getNumOperands();

  SmallVector<SmallVector<VPValue *, 4>, 4> PerOperand;
  for (unsigned Idx = 0; Idx != NumBundles; ++Idx)
    PerOperand.push_back(getOperands(Values, Idx));
  return PerOperand;
}
189

190
/// Returns the opcode of Values or ~0 if they do not all agree.
191
static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
192
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
193
  if (any_of(Values, [Opcode](VPValue *V) {
194
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
195
      }))
196
    return std::nullopt;
197
  return {Opcode};
198
}
199

200
/// Returns true if A and B access sequential memory if they are loads or
201
/// stores or if they have identical opcodes otherwise.
202
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
203
                                  VPInterleavedAccessInfo &IAI) {
204
  if (A->getOpcode() != B->getOpcode())
205
    return false;
206

207
  if (A->getOpcode() != Instruction::Load &&
208
      A->getOpcode() != Instruction::Store)
209
    return true;
210
  auto *GA = IAI.getInterleaveGroup(A);
211
  auto *GB = IAI.getInterleaveGroup(B);
212

213
  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
214
}
215

216
/// Implements getLAScore from Listing 7 in the paper.
217
/// Traverses and compares operands of V1 and V2 to MaxLevel.
218
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
219
                           VPInterleavedAccessInfo &IAI) {
220
  auto *I1 = dyn_cast<VPInstruction>(V1);
221
  auto *I2 = dyn_cast<VPInstruction>(V2);
222
  // Currently we only support VPInstructions.
223
  if (!I1 || !I2)
224
    return 0;
225

226
  if (MaxLevel == 0)
227
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
228

229
  unsigned Score = 0;
230
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
231
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
232
      Score +=
233
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
234
  return Score;
235
}
236

237
std::pair<VPlanSlp::OpMode, VPValue *>
238
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
239
                  SmallPtrSetImpl<VPValue *> &Candidates,
240
                  VPInterleavedAccessInfo &IAI) {
241
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
242
         "Currently we only handle load and commutative opcodes");
243
  LLVM_DEBUG(dbgs() << "      getBest\n");
244

245
  SmallVector<VPValue *, 4> BestCandidates;
246
  LLVM_DEBUG(dbgs() << "        Candidates  for "
247
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
248
  for (auto *Candidate : Candidates) {
249
    auto *LastI = cast<VPInstruction>(Last);
250
    auto *CandidateI = cast<VPInstruction>(Candidate);
251
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
252
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
253
                        << " ");
254
      BestCandidates.push_back(Candidate);
255
    }
256
  }
257
  LLVM_DEBUG(dbgs() << "\n");
258

259
  if (BestCandidates.empty())
260
    return {OpMode::Failed, nullptr};
261

262
  if (BestCandidates.size() == 1)
263
    return {Mode, BestCandidates[0]};
264

265
  VPValue *Best = nullptr;
266
  unsigned BestScore = 0;
267
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
268
    unsigned PrevScore = ~0u;
269
    bool AllSame = true;
270

271
    // FIXME: Avoid visiting the same operands multiple times.
272
    for (auto *Candidate : BestCandidates) {
273
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
274
      if (PrevScore == ~0u)
275
        PrevScore = Score;
276
      if (PrevScore != Score)
277
        AllSame = false;
278
      PrevScore = Score;
279

280
      if (Score > BestScore) {
281
        BestScore = Score;
282
        Best = Candidate;
283
      }
284
    }
285
    if (!AllSame)
286
      break;
287
  }
288
  LLVM_DEBUG(dbgs() << "Found best "
289
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
290
                    << "\n");
291
  Candidates.erase(Best);
292

293
  return {Mode, Best};
294
}
295

296
/// Re-orders the operand bundles recorded in MultiNodeOps lane by lane.
///
/// Lane 0 of every bundle is kept as-is. For each following lane, the values
/// found in that lane across all bundles form the candidate set, and each
/// bundle picks the candidate that best follows its previous lane via
/// getBest().
///
/// If no candidate can be found for a bundle, markFailed() is called, a null
/// entry is recorded for that lane, and the bundle is flagged as failed. All
/// later lanes of a failed bundle are filled with null entries as well, so
/// that getBest() is never called with a null previous value and every
/// returned bundle keeps one entry per lane.
///
/// \returns One (placeholder instruction, re-ordered bundle) pair per entry
/// of MultiNodeOps, in the same order. Empty if MultiNodeOps is empty.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;

  // Nothing to re-order; also avoids indexing MultiNodeOps[0] below.
  if (MultiNodeOps.empty())
    return FinalOrder;

  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Seed every bundle with its lane 0 value and derive its matching mode.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << "  Candidates  ");
    // Iterate by reference: each element owns a vector that should not be
    // copied just to read one lane.
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed) {
        // An earlier lane already failed; keep the bundle aligned with the
        // lane count instead of matching against a null previous value.
        FinalOrder[Op].second.push_back(static_cast<VPValue *>(nullptr));
        continue;
      }

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second) {
        FinalOrder[Op].second.push_back(Res.second);
        continue;
      }

      // TODO: handle this case
      Mode[Op] = OpMode::Failed;
      FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}
343

344
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
345
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
346
  dbgs() << " Ops: ";
347
  for (auto *Op : Values) {
348
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
349
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
350
        dbgs() << *Instr << " | ";
351
        continue;
352
      }
353
    dbgs() << " nullptr | ";
354
  }
355
  dbgs() << "\n";
356
}
357
#endif
358

359
/// Recursively builds the SLP graph node for the bundle \p Values.
///
/// Returns the previously created node if the bundle was already combined.
/// Otherwise the bundle is checked with areVectorizable(), its operand
/// bundles are built recursively, and a new combined VPInstruction is created
/// and registered via addCombined().
///
/// For bundles with a commutative opcode, operand bundles that have a
/// different opcode are not built right away. They are recorded in
/// MultiNodeOps behind a placeholder instruction; the outermost commutative
/// call re-orders them with reorderMultiNodeOps(), builds them, and replaces
/// the placeholders.
///
/// \returns The combined instruction, or nullptr (via markFailed()) if the
/// bundle is not vectorizable or CompletelySLP is false once the operands
/// have been processed.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto Cached = BundleToCombined.find(to_vector<4>(Values));
  if (Cached != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UserIt = V->user_begin();
      auto *FirstUser = *UserIt++;
      for (; UserIt != V->user_end(); UserIt++)
        assert(*UserIt == FirstUser && "Currently we only support SLP trees.");
    }
#endif
    return Cached->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  const unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (!areCommutative(Values)) {
    LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load) {
      // Loads terminate the tree: their operand 0 is used directly.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    } else {
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
    }
  } else {
    // Only the outermost commutative bundle is the root of the multi node.
    const bool IsMultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << "  Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      const bool SameOpcodeAsValues =
          OperandsOpcode && *OperandsOpcode == ValuesOpcode;
      if (SameOpcodeAsValues) {
        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
        continue;
      }

      LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
      // Create dummy VPInstruction, which will we replace later by the
      // re-ordered operand.
      VPInstruction *Placeholder = new VPInstruction(0, {});
      CombinedOperands.push_back(Placeholder);
      MultiNodeOps.emplace_back(Placeholder, Operands);
    }

    if (IsMultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        // Swap the placeholder for the real node in this bundle's operands.
        for (VPValue *&Slot : CombinedOperands)
          if (Slot == Ops.first)
            Slot = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  }

  // Loads and stores are mapped to their SLP-specific VPInstruction opcodes;
  // every other opcode is kept unchanged.
  unsigned Opcode;
  if (ValuesOpcode == Instruction::Load)
    Opcode = VPInstruction::SLPLoad;
  else if (ValuesOpcode == Instruction::Store)
    Opcode = VPInstruction::SLPStore;
  else
    Opcode = ValuesOpcode;

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *FirstInstr = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *Combined =
      new VPInstruction(Opcode, CombinedOperands, FirstInstr->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *Combined << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, Combined);
  return Combined;
}
470

471
Product

Resources

Company