Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers is reduced by the combine.
//
// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect list of all instructions we know how to merge in a subset of the
// block. It returns an iterator to the instruction after the last one analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {

  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
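
A minimal sketch of the two-channel row of the tradeoff table in the header (popcount(dmask) == 2, four loads), using the same placeholder values %s, %t and %rsrc as the header's own example; this illustrates the rewrite the pass performs, with only the reassembly of fragment 0 shown.

; Before: four two-channel loads (dmask = 3) of fragments 0..3 at the same coordinate.
%v0 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%v1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
%v2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
%v3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
; ==> one image_msaa_load per set dmask bit (dmask = 1 and dmask = 2):
%m0 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
%m1 = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
; ... then each original result is rebuilt from lane (FragId urem 4) of the per-channel vectors:
%e0 = extractelement <4 x float> %m0, i64 0
%e1 = extractelement <4 x float> %m1, i64 0
%t0 = insertelement <2 x float> undef, float %e0, i64 0
%v0.new = insertelement <2 x float> %t0, float %e1, i64 1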