Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers is reduced by the combine.
//
// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect list of all instructions we know how to merge in a subset of the
// block. It returns an iterator to the instruction after the last one analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {

  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
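
A minimal sketch of the two-channel row of the tradeoff table in the header (popcount(dmask) == 2, four loads), using the same placeholder values %s, %t and %rsrc as the header's own example; this illustrates the rewrite the pass performs, with only the reassembly of fragment 0 shown.

; Before: four two-channel loads (dmask = 3) of fragments 0..3 at the same coordinate.
%v0 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%v1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
%v2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
%v3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
; ==> one image_msaa_load per set dmask bit (dmask = 1 and dmask = 2):
%m0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%m1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
; ... then each original result is rebuilt from lane (FragId urem 4) of the per-channel vectors:
%e0 = extractelement <4 x float> %m0, i64 0
%e1 = extractelement <4 x float> %m1, i64 0
%t0 = insertelement <2 x float> undef, float %e0, i64 0
%v0.new = insertelement <2 x float> %t0, float %e1, i64 1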