Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
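// Editorial note (not in the original source): these queries are deliberately
// conservative. Any recipe kind not explicitly listed falls through to the
// `default` case and is treated as if it may access memory.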
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                  ? VPLane::getFirstLane()
                  : VPLane::getLastLaneForVF(State.VF);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  // Set insertion point in PredBB in case an extract needs to be generated.
  // TODO: Model extracts explicitly.
  State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  if (auto *UI = getInstructionForCost(this))
    if (Ctx.skipCostComputation(UI, VF.isVector()))
      return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}
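// Editorial note (not in the original source): ForceTargetInstructionCost is
// the -force-target-instruction-cost command-line flag registered in
// LoopVectorize.cpp; e.g. -force-target-instruction-cost=1 assigns every
// recipe with a valid cost a uniform cost of 1, which helps when bisecting
// cost-model decisions.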
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // returns 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop trip count.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                               ScalarTC, Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(),
          Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop; vsetvli
    // can be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
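    // Editorial note (not in the original source): the intrinsic returns a
    // per-iteration element count bounded by both AVL (the remaining trip
    // count) and the hardware vector length; on RISC-V it corresponds to the
    // value a vsetvli would produce.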
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now
    // and to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
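    // Editorial note (not in the original source): e.g. with UF = 2 an
    // unordered add reduction combines the two part vectors with a single
    // vector add in the loop below; only the final horizontal reduction
    // further down collapses the result to a scalar.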
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to
      // extend the reduction to the wider type before we branch to the
      // original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
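    // Editorial note (not in the original source): Offset counts backwards
    // from the end of the vector iteration space; Offset = 1 selects the last
    // lane of the final unrolled part, Offset = 2 the one before it, etc.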
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we
  // don't support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }
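    // Editorial note (not in the original source): when only the first part
    // is used, later parts simply re-register part 0's value instead of
    // emitting redundant IR.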
    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
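      // Editorial note (not in the original source): e.g. the i32 exponent
      // of llvm.powi is such a scalar operand, so lane 0 of part 0 is reused
      // for every unrolled part.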
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar
      // value from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}
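// Editorial note (not in the original source): `Variant`, when set, is a
// vector-function variant resolved via the vector-function ABI (for example
// a vector math-library routine or an OpenMP `declare simd` clone); it is
// used when no suitable vector intrinsic exists.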
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
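/// Editorial example (not in the original source): for a splat Val of v,
/// StartIdx = 0 and Step = 2, the result is <v, v + 2, v + 4, ...>.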
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new
  // induction variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step
  // value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }
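  // Editorial note (not in the original source): e.g. for a fixed VF of 4,
  // UF of 2 and unit step, part 0 holds <i, i+1, i+2, i+3> and part 1 the
  // step.add value <i+4, ..., i+7>; the final step.add becomes the phi's
  // back-edge value below.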
  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}
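// Editorial note (not in the original source): the canonical IV is the
// zero-based, unit-step integer induction that drives the vector loop; other
// inductions can be recomputed from it as Start + CanonicalIV * Step, which
// is what the DERIVED-IV recipe printed below expresses.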
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum
      // number of elements so we do those below. This improves the code
      // quality when trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
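    // Editorial note (not in the original source): each lane's value is
    // BaseIV + (Part * VF + Lane) * Step; e.g. Part = 1, VF = 4, Lane = 2
    // with a unit step yields BaseIV + 6.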
    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
"Inv" : "Var") << "]";15761577O << " ";1578printAsOperand(O, SlotTracker);1579O << " = getelementptr";1580printFlags(O);1581printOperands(O, SlotTracker);1582}1583#endif15841585void VPVectorPointerRecipe ::execute(VPTransformState &State) {1586auto &Builder = State.Builder;1587State.setDebugLocFrom(getDebugLoc());1588for (unsigned Part = 0; Part < State.UF; ++Part) {1589// Calculate the pointer for the specific unroll-part.1590Value *PartPtr = nullptr;1591// Use i32 for the gep index type when the value is constant,1592// or query DataLayout for a more suitable index type otherwise.1593const DataLayout &DL =1594Builder.GetInsertBlock()->getDataLayout();1595Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)1596? DL.getIndexType(IndexedTy->getPointerTo())1597: Builder.getInt32Ty();1598Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));1599bool InBounds = isInBounds();1600if (IsReverse) {1601// If the address is consecutive but reversed, then the1602// wide store needs to start at the last vector element.1603// RunTimeVF = VScale * VF.getKnownMinValue()1604// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()1605Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);1606// NumElt = -Part * RunTimeVF1607Value *NumElt = Builder.CreateMul(1608ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);1609// LastLane = 1 - RunTimeVF1610Value *LastLane =1611Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);1612PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);1613PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);1614} else {1615Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);1616PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);1617}16181619State.set(this, PartPtr, Part, /*IsScalar*/ true);1620}1621}16221623#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)1624void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,1625VPSlotTracker &SlotTracker) const {1626O << Indent;1627printAsOperand(O, SlotTracker);1628O << " = vector-pointer ";1629if (IsReverse)1630O << "(reverse) ";16311632printOperands(O, SlotTracker);1633}1634#endif16351636void VPBlendRecipe::execute(VPTransformState &State) {1637State.setDebugLocFrom(getDebugLoc());1638// We know that all PHIs in non-header blocks are converted into1639// selects, so we don't have to worry about the insertion order and we1640// can just use the builder.1641// At this point we generate the predication tree. 
void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder. At this point we generate the predication
  // tree. There may be duplications since this is a simple recursive scan,
  // but future optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
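      // Masked-off lanes must not change the reduced value: blend in the
      // reduction's identity (e.g. 0 for an add reduction, 1 for a mul
      // reduction) wherever the condition is false.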
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  assert(State.UF == 1 &&
         "Expected only UF == 1 when vectorizing with explicit vector length.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
  Builder.setFastMathFlags(RdxDesc.getFastMathFlags());

  RecurKind Kind = RdxDesc.getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp(), 0);
  Value *EVL = State.get(getEVL(), VPIteration(0, 0));

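  // Lanes past EVL must not take part in the reduction. Routing the calls
  // below through a VectorBuilder emits vector-predicated intrinsics that
  // take the mask and the explicit vector length as extra operands.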
  VectorBuilder VBuilder(Builder);
  VBuilder.setEVL(EVL);
  Value *Mask;
  // TODO: move the all-true mask generation into VectorBuilder.
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp, 0);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  VBuilder.setMask(Mask);

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
  } else {
    NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
  }
  State.set(this, NewRed, 0, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}

void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
/// if it is either defined outside the vector region or its operand is known
/// to be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
/// TODO: Uniformity should be associated with a VPValue and there should be a
/// generic way to check.
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
  return C->isDefinedOutsideVectorRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
}

Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (Opcode) {
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::Trunc: {
    // Note: SExt/ZExt not used yet.
    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

void VPScalarCastRecipe::execute(VPTransformState &State) {
  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *Res;
    // Only generate a single instance if the recipe is uniform across UFs
    // and VFs.
    if (Part > 0 && IsUniformAcrossVFsAndUFs)
      Res = State.get(this, VPIteration(0, 0));
    else
      Res = generate(State, Part);
    State.set(this, Res, VPIteration(Part, 0));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

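  // The recipe runs once per replicated lane: extract this lane's bit from
  // the (possibly vector) block mask to drive the scalar conditional branch.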
  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

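/// Cast the vector \p V element-wise to \p DstVTy. When no direct bit- or
/// pointer-cast between the element types exists (pointer vs. floating
/// point), go through an intermediate integer vector.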
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
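/// E.g. interleaving <a0, a1> and <b0, b1> produces <a0, b0, a1, b1>.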
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
                                   Vals,
                                   /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];   // Member of index 0
//     G = Pic[i+1]; // Member of index 1
//     B = Pic[i+2]; // Member of index 2
//     ...           // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R; // Member of index 0
//     Pic[i+1] = G; // Member of index 1
//     Pic[i+2] = B; // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec           ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  const InterleaveGroup<Instruction> *Group = IG;
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  VPValue *BlockInMask = getMask();
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  Value *Idx;
  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
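  // The net adjustment for a reversed group below is
  // Idx = -((RuntimeVF - 1) * Factor + Index); e.g. with VF = 4, factor 3 and
  // member index 0, the wide access starts 9 elements before the current
  // pointer.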
  if (Group->isReverse()) {
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
    Idx = State.Builder.CreateMul(Idx,
                                  State.Builder.getInt32(Group->getFactor()));
    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
    Idx = State.Builder.CreateNeg(Idx);
  } else
    Idx = State.Builder.getInt32(-Index);

  VPValue *Addr = getAddr();
  for (unsigned Part = 0; Part < State.UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    if (auto *I = dyn_cast<Instruction>(AddrPart))
      State.setDebugLocFrom(I->getDebugLoc());

    // Note that the current instruction could be at any member index, so the
    // address needs to be adjusted to the member of index 0.
    //
    // E.g. a = A[i+1];     // Member of index 1 (Current instruction)
    //      b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g. A[i+1] = a;     // Member of index 1
    //      A[i]   = b;     // Member of index 0
    //      A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
    AddrParts.push_back(AddrPart);
  }

  State.setDebugLocFrom(Instr->getDebugLoc());
  Value *PoisonVec = PoisonValue::get(VecTy);

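  // Build the mask for the whole interleaved access by replicating each lane
  // of the block mask Factor times; e.g. for factor 3 and VF = 4,
  // <m0, m1, m2, m3> becomes <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>.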
  auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *BlockInMaskPart = State.get(BlockInMask, Part);
      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
                                     State.VF.getKnownMinValue() * 2, true);
      return State.Builder.CreateIntrinsic(
          MaskTy, Intrinsic::vector_interleave2, Ops,
          /*FMFSource=*/nullptr, "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *BlockInMaskPart = State.get(BlockInMask, Part);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        BlockInMaskPart,
        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
        "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                   ShuffledMask, MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps = createBitMaskForGaps(State.Builder,
                                         State.VF.getKnownMinValue(), *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < State.UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
        NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
                                                 Group->getAlign(), GroupMask,
                                                 PoisonVec, "wide.masked.vec");
      } else
        NewLoad = State.Builder.CreateAlignedLoad(
            VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    ArrayRef<VPValue *> VPDefs = definedValues();
    const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");

      for (unsigned Part = 0; Part < State.UF; ++Part) {
        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
        // so must use intrinsics to deinterleave.
        Value *DI = State.Builder.CreateIntrinsic(
            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
            /*FMFSource=*/nullptr, "strided.vec");
        unsigned J = 0;
        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          Instruction *Member = Group->getMember(I);

          if (!Member)
            continue;

          Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
          // If this member has a different type, cast the result.
          if (Member->getType() != ScalarTy) {
            VectorType *OtherVTy =
                VectorType::get(Member->getType(), State.VF);
            StridedVec =
                createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
          }

          if (Group->isReverse())
            StridedVec =
                State.Builder.CreateVectorReverse(StridedVec, "reverse");

          State.set(VPDefs[J], StridedVec, Part);
          ++J;
        }
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
      for (unsigned Part = 0; Part < State.UF; Part++) {
        Value *StridedVec = State.Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has a different type, cast the result.
        if (Member->getType() != ScalarTy) {
          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  for (unsigned Part = 0; Part < State.UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
      ++StoredIdx;

      if (Group->isReverse())
        StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Interleave all the smaller vectors into one wider vector.
    Value *IVec =
        interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = State.Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), GroupMask);
    } else
      NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
                                                       Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

IG->getFactor() << " at ";2379IG->getInsertPos()->printAsOperand(O, false);2380O << ", ";2381getAddr()->printAsOperand(O, SlotTracker);2382VPValue *Mask = getMask();2383if (Mask) {2384O << ", ";2385Mask->printAsOperand(O, SlotTracker);2386}23872388unsigned OpIdx = 0;2389for (unsigned i = 0; i < IG->getFactor(); ++i) {2390if (!IG->getMember(i))2391continue;2392if (getNumStoreOperands() > 0) {2393O << "\n" << Indent << " store ";2394getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);2395O << " to index " << i;2396} else {2397O << "\n" << Indent << " ";2398getVPValue(OpIdx)->printAsOperand(O, SlotTracker);2399O << " = load from index " << i;2400}2401++OpIdx;2402}2403}2404#endif24052406void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {2407Value *Start = getStartValue()->getLiveInIRValue();2408PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");2409EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());24102411BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);2412EntryPart->addIncoming(Start, VectorPH);2413EntryPart->setDebugLoc(getDebugLoc());2414for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)2415State.set(this, EntryPart, Part, /*IsScalar*/ true);2416}24172418#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2419void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,2420VPSlotTracker &SlotTracker) const {2421O << Indent << "EMIT ";2422printAsOperand(O, SlotTracker);2423O << " = CANONICAL-INDUCTION ";2424printOperands(O, SlotTracker);2425}2426#endif24272428bool VPCanonicalIVPHIRecipe::isCanonical(2429InductionDescriptor::InductionKind Kind, VPValue *Start,2430VPValue *Step) const {2431// Must be an integer induction.2432if (Kind != InductionDescriptor::IK_IntInduction)2433return false;2434// Start must match the start value of this canonical induction.2435if (Start != getStartValue())2436return false;24372438// If the step is defined by a recipe, it is not a ConstantInt.2439if (Step->getDefiningRecipe())2440return false;24412442ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());2443return StepC && StepC->isOne();2444}24452446bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {2447return IsScalarAfterVectorization &&2448(!IsScalable || vputils::onlyFirstLaneUsed(this));2449}24502451#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2452void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,2453VPSlotTracker &SlotTracker) const {2454O << Indent << "EMIT ";2455printAsOperand(O, SlotTracker);2456O << " = WIDEN-POINTER-INDUCTION ";2457getStartValue()->printAsOperand(O, SlotTracker);2458O << ", " << *IndDesc.getStep();2459}2460#endif24612462void VPExpandSCEVRecipe::execute(VPTransformState &State) {2463assert(!State.Instance && "cannot be used in per-lane");2464const DataLayout &DL = State.CFG.PrevBB->getDataLayout();2465SCEVExpander Exp(SE, DL, "induction");24662467Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),2468&*State.Builder.GetInsertPoint());2469assert(!State.ExpandedSCEVs.contains(Expr) &&2470"Same SCEV expanded multiple times");2471State.ExpandedSCEVs[Expr] = Res;2472for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)2473State.set(this, Res, {Part, 0});2474}24752476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2477void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,2478VPSlotTracker &SlotTracker) const {2479O << Indent << "EMIT ";2480getVPSingleValue()->printAsOperand(O, SlotTracker);2481O << " = EXPAND SCEV " << 
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
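    // Only the last lane of the initial value is used by the recurrence;
    // e.g. for VF = 4 the vector built here is <poison, poison, poison, init>.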
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

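  // Only part 0 carries the start value; e.g. an unordered add reduction with
  // start value %s and VF = 4 begins at <%s, 0, 0, 0>, while every other part
  // begins at the identity splat <0, 0, 0, 0>.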
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif