Path: blob/main/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
//===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVMatInt.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) {
  if (!HasRVC)
    return Res.size();

  int Cost = 0;
  for (auto Instr : Res) {
    // Assume instructions that aren't listed aren't compressible.
    bool Compressed = false;
    switch (Instr.getOpcode()) {
    case RISCV::SLLI:
    case RISCV::SRLI:
      Compressed = true;
      break;
    case RISCV::ADDI:
    case RISCV::ADDIW:
    case RISCV::LUI:
      Compressed = isInt<6>(Instr.getImm());
      break;
    }
    // Two RVC instructions take the same space as one RVI instruction, but
    // can take longer to execute than the single RVI instruction. Thus, we
    // consider that two RVC instructions are slightly more costly than one
    // RVI instruction. For longer sequences of RVC instructions the space
    // savings can be worth it, though. The costs below try to model that.
    if (!Compressed)
      Cost += 100; // Baseline cost of one RVI instruction: 100%.
    else
      Cost += 70; // 70% cost of baseline.
  }
  return Cost;
}

// Recursively generate a sequence for materializing an integer.
static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI,
                                RISCVMatInt::InstSeq &Res) {
  bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);

  // Use BSETI for a single bit that can't be expressed by a single LUI or ADDI.
  if (STI.hasFeature(RISCV::FeatureStdExtZbs) && isPowerOf2_64(Val) &&
      (!isInt<32>(Val) || Val == 0x800)) {
    Res.emplace_back(RISCV::BSETI, Log2_64(Val));
    return;
  }

  if (isInt<32>(Val)) {
    // Depending on the active bits in the immediate value v, the following
    // instruction sequences are emitted:
    //
    // v == 0                        : ADDI
    // v[0,12) != 0 && v[12,32) == 0 : ADDI
    // v[0,12) == 0 && v[12,32) != 0 : LUI
    // v[0,32) != 0                  : LUI+ADDI(W)
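    //
    // For illustration, v == 0x12345678 yields Hi20 == 0x12345 and
    // Lo12 == 0x678, i.e. LUI 0x12345 followed by ADDIW 0x678 on RV64 (plain
    // ADDI on RV32). The +0x800 below rounds Hi20 up when bit 11 of v is set,
    // compensating for the sign extension of the 12-bit ADDI immediate.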
    int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF;
    int64_t Lo12 = SignExtend64<12>(Val);

    if (Hi20)
      Res.emplace_back(RISCV::LUI, Hi20);

    if (Lo12 || Hi20 == 0) {
      unsigned AddiOpc = (IsRV64 && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
      Res.emplace_back(AddiOpc, Lo12);
    }
    return;
  }

  assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target");

  // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
  // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
  // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
  // while the following ADDI instructions contribute up to 12 bits each.
  //
  // At first glance, implementing this seems possible by simply emitting the
  // most significant 32 bits (LUI+ADDIW) followed by as many left shifts (SLLI)
  // and immediate additions (ADDI) as needed. However, because ADDI performs a
  // sign-extended addition, that approach only works when at most 11 bits of
  // the ADDI immediates are used. Using all 12 bits of the ADDI instructions,
  // as GAS does, requires that the constant be processed starting with the
  // least significant bit.
  //
  // In the following, constants are processed from LSB to MSB but instruction
  // emission is performed from MSB to LSB by recursively calling
  // generateInstSeqImpl. In each recursion, first the lowest 12 bits are
  // removed from the constant and the optimal shift amount, which can be
  // greater than 12 bits if the constant is sparse, is determined. Then, the
  // shifted remaining constant is processed recursively and gets emitted as
  // soon as it fits into 32 bits. The emission of the shifts and additions is
  // subsequently performed when the recursion returns.
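  //
  // As an illustrative example, Val == 0x500000001 first has Lo12 == 1 peeled
  // off, leaving 0x500000000; that is shifted right by its 32 trailing zeros
  // down to 5, which the recursion materializes with a single ADDI. Unwinding
  // the recursion then appends SLLI 32 and ADDI 1, giving ADDI 5; SLLI 32;
  // ADDI 1.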

  int64_t Lo12 = SignExtend64<12>(Val);
  Val = (uint64_t)Val - (uint64_t)Lo12;

  int ShiftAmount = 0;
  bool Unsigned = false;

  // Val might now be valid for LUI without needing a shift.
  if (!isInt<32>(Val)) {
    ShiftAmount = llvm::countr_zero((uint64_t)Val);
    Val >>= ShiftAmount;

    // If the remaining bits don't fit in 12 bits, we might be able to reduce
    // the shift amount in order to use LUI which will zero the lower 12 bits.
    if (ShiftAmount > 12 && !isInt<12>(Val)) {
      if (isInt<32>((uint64_t)Val << 12)) {
        // Reduce the shift amount and add zeros to the LSBs so it will match
        // LUI.
        ShiftAmount -= 12;
        Val = (uint64_t)Val << 12;
      } else if (isUInt<32>((uint64_t)Val << 12) &&
                 STI.hasFeature(RISCV::FeatureStdExtZba)) {
        // Reduce the shift amount and add zeros to the LSBs so it will match
        // LUI, then shift left with SLLI.UW to clear the upper 32 set bits.
        ShiftAmount -= 12;
        Val = ((uint64_t)Val << 12) | (0xffffffffull << 32);
        Unsigned = true;
      }
    }

    // Try to use SLLI_UW for Val when it is uint32 but not int32.
    if (isUInt<32>((uint64_t)Val) && !isInt<32>((uint64_t)Val) &&
        STI.hasFeature(RISCV::FeatureStdExtZba)) {
      // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with
      // SLLI_UW.
      Val = ((uint64_t)Val) | (0xffffffffull << 32);
      Unsigned = true;
    }
  }

  generateInstSeqImpl(Val, STI, Res);

  // Skip the shift if we were able to use LUI directly.
  if (ShiftAmount) {
    unsigned Opc = Unsigned ? RISCV::SLLI_UW : RISCV::SLLI;
    Res.emplace_back(Opc, ShiftAmount);
  }

  if (Lo12)
    Res.emplace_back(RISCV::ADDI, Lo12);
}

static unsigned extractRotateInfo(int64_t Val) {
  // For the case 0b111..1..xxxxxx1..1..
  unsigned LeadingOnes = llvm::countl_one((uint64_t)Val);
  unsigned TrailingOnes = llvm::countr_one((uint64_t)Val);
  if (TrailingOnes > 0 && TrailingOnes < 64 &&
      (LeadingOnes + TrailingOnes) > (64 - 12))
    return 64 - TrailingOnes;

  // For the case 0bxxx1..1..1...xxx
  unsigned UpperTrailingOnes = llvm::countr_one(Hi_32(Val));
  unsigned LowerLeadingOnes = llvm::countl_one(Lo_32(Val));
  if (UpperTrailingOnes < 32 &&
      (UpperTrailingOnes + LowerLeadingOnes) > (64 - 12))
    return 32 - UpperTrailingOnes;

  return 0;
}

static void generateInstSeqLeadingZeros(int64_t Val, const MCSubtargetInfo &STI,
                                        RISCVMatInt::InstSeq &Res) {
  assert(Val > 0 && "Expected positive val");

  unsigned LeadingZeros = llvm::countl_zero((uint64_t)Val);
  uint64_t ShiftedVal = (uint64_t)Val << LeadingZeros;
  // Fill in the bits that will be shifted out with 1s. An example where this
  // helps is trailing one masks with 32 or more ones. This will generate
  // ADDI -1 and an SRLI.
  ShiftedVal |= maskTrailingOnes<uint64_t>(LeadingZeros);

  RISCVMatInt::InstSeq TmpSeq;
  generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

  // Keep the new sequence if it is an improvement or the original is empty.
  if ((TmpSeq.size() + 1) < Res.size() ||
      (Res.empty() && TmpSeq.size() < 8)) {
    TmpSeq.emplace_back(RISCV::SRLI, LeadingZeros);
    Res = TmpSeq;
  }

  // Some cases can benefit from filling the lower bits with zeros instead.
  ShiftedVal &= maskTrailingZeros<uint64_t>(LeadingZeros);
  TmpSeq.clear();
  generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

  // Keep the new sequence if it is an improvement or the original is empty.
  if ((TmpSeq.size() + 1) < Res.size() ||
      (Res.empty() && TmpSeq.size() < 8)) {
    TmpSeq.emplace_back(RISCV::SRLI, LeadingZeros);
    Res = TmpSeq;
  }

  // If we have exactly 32 leading zeros and Zba, we can try using zext.w at
  // the end of the sequence.
  if (LeadingZeros == 32 && STI.hasFeature(RISCV::FeatureStdExtZba)) {
    // Try replacing the upper bits with 1s.
    uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros);
    TmpSeq.clear();
    generateInstSeqImpl(LeadingOnesVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size() ||
        (Res.empty() && TmpSeq.size() < 8)) {
      TmpSeq.emplace_back(RISCV::ADD_UW, 0);
      Res = TmpSeq;
    }
  }
}

namespace llvm::RISCVMatInt {
InstSeq generateInstSeq(int64_t Val, const MCSubtargetInfo &STI) {
  RISCVMatInt::InstSeq Res;
  generateInstSeqImpl(Val, STI, Res);

  // If the low 12 bits are non-zero, the first expansion may end with an ADDI
  // or ADDIW. If there are trailing zeros, try generating a sign extended
  // constant with no trailing zeros and use a final SLLI to restore them.
  if ((Val & 0xfff) != 0 && (Val & 1) == 0 && Res.size() >= 2) {
    unsigned TrailingZeros = llvm::countr_zero((uint64_t)Val);
    int64_t ShiftedVal = Val >> TrailingZeros;
    // If we can use C.LI+C.SLLI instead of LUI+ADDI(W), prefer that since it's
    // more compressible. But only do so if LUI+ADDI(W) isn't fusable.
    // NOTE: We don't check for the C extension to minimize differences in
    // generated code.
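    // For illustration, Val == 0x1F00 would normally be emitted as
    // LUI 2; ADDI(W) -256, but shifting out its 8 trailing zeros leaves 31,
    // which fits in 6 bits, so (absent LUI+ADDI fusion) ADDI 31; SLLI 8, i.e.
    // C.LI+C.SLLI, is used instead.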
    bool IsShiftedCompressible =
        isInt<6>(ShiftedVal) && !STI.hasFeature(RISCV::TuneLUIADDIFusion);
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(ShiftedVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size() || IsShiftedCompressible) {
      TmpSeq.emplace_back(RISCV::SLLI, TrailingZeros);
      Res = TmpSeq;
    }
  }

  // If we have a 1 or 2 instruction sequence this is the best we can do. This
  // will always be true for RV32 and will often be true for RV64.
  if (Res.size() <= 2)
    return Res;

  assert(STI.hasFeature(RISCV::Feature64Bit) &&
         "Expected RV32 to only need 2 instructions");

  // If the lower 13 bits are something like 0x17ff, try to add 1 to change the
  // lower 13 bits to 0x1800. We can restore this with an ADDI of -1 at the end
  // of the sequence. Call generateInstSeqImpl on the new constant which may
  // subtract 0xfffffffffffff800 to create another ADDI. This will leave a
  // constant with more than 12 trailing zeros for the next recursive step.
  if ((Val & 0xfff) != 0 && (Val & 0x1800) == 0x1000) {
    int64_t Imm12 = -(0x800 - (Val & 0xfff));
    int64_t AdjustedVal = Val - Imm12;
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(AdjustedVal, STI, TmpSeq);

    // Keep the new sequence if it is an improvement.
    if ((TmpSeq.size() + 1) < Res.size()) {
      TmpSeq.emplace_back(RISCV::ADDI, Imm12);
      Res = TmpSeq;
    }
  }

  // If the constant is positive we might be able to generate a shifted constant
  // with no leading zeros and use a final SRLI to restore them.
  if (Val > 0 && Res.size() > 2) {
    generateInstSeqLeadingZeros(Val, STI, Res);
  }

  // If the constant is negative, try inverting it and using our leading zero
  // optimizations. Use an XORI to invert the final value.
  if (Val < 0 && Res.size() > 3) {
    uint64_t InvertedVal = ~(uint64_t)Val;
    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqLeadingZeros(InvertedVal, STI, TmpSeq);

    // Keep it if we found a sequence that is smaller after inverting.
    if (!TmpSeq.empty() && (TmpSeq.size() + 1) < Res.size()) {
      TmpSeq.emplace_back(RISCV::XORI, -1);
      Res = TmpSeq;
    }
  }

  // If the low and high halves are the same, use PACK. The PACK instruction
  // packs the XLEN/2-bit lower halves of rs1 and rs2 into rd, with rs1 in the
  // lower half and rs2 in the upper half.
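  // For illustration, with Zbkb the constant 0x1234567812345678 can be built
  // as LUI 0x12345; ADDIW 0x678; PACK, since packing the materialized
  // 0x12345678 with itself replicates it into both halves.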
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbkb)) {
    int64_t LoVal = SignExtend64<32>(Val);
    int64_t HiVal = SignExtend64<32>(Val >> 32);
    if (LoVal == HiVal) {
      RISCVMatInt::InstSeq TmpSeq;
      generateInstSeqImpl(LoVal, STI, TmpSeq);
      if ((TmpSeq.size() + 1) < Res.size()) {
        TmpSeq.emplace_back(RISCV::PACK, 0);
        Res = TmpSeq;
      }
    }
  }

  // Perform optimization with BSETI in the Zbs extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to zero.
    // Xor that with the original value to get which bits should be set by
    // BSETI.
    uint64_t Lo = Val & 0x7fffffff;
    uint64_t Hi = Val ^ Lo;
    assert(Hi != 0);
    RISCVMatInt::InstSeq TmpSeq;

    if (Lo != 0)
      generateInstSeqImpl(Lo, STI, TmpSeq);

    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
      do {
        TmpSeq.emplace_back(RISCV::BSETI, llvm::countr_zero(Hi));
        Hi &= (Hi - 1); // Clear lowest set bit.
      } while (Hi != 0);
      Res = TmpSeq;
    }
  }

  // Perform optimization with BCLRI in the Zbs extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to one.
    // Xor that with the original value to get which bits should be cleared by
    // BCLRI.
    uint64_t Lo = Val | 0xffffffff80000000;
    uint64_t Hi = Val ^ Lo;
    assert(Hi != 0);

    RISCVMatInt::InstSeq TmpSeq;
    generateInstSeqImpl(Lo, STI, TmpSeq);

    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
      do {
        TmpSeq.emplace_back(RISCV::BCLRI, llvm::countr_zero(Hi));
        Hi &= (Hi - 1); // Clear lowest set bit.
      } while (Hi != 0);
      Res = TmpSeq;
    }
  }

  // Perform optimization with SH*ADD in the Zba extension.
  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZba)) {
    int64_t Div = 0;
    unsigned Opc = 0;
    RISCVMatInt::InstSeq TmpSeq;
    // Select the opcode and divisor.
    if ((Val % 3) == 0 && isInt<32>(Val / 3)) {
      Div = 3;
      Opc = RISCV::SH1ADD;
    } else if ((Val % 5) == 0 && isInt<32>(Val / 5)) {
      Div = 5;
      Opc = RISCV::SH2ADD;
    } else if ((Val % 9) == 0 && isInt<32>(Val / 9)) {
      Div = 9;
      Opc = RISCV::SH3ADD;
    }
    // Build the new instruction sequence.
    if (Div > 0) {
      generateInstSeqImpl(Val / Div, STI, TmpSeq);
      if ((TmpSeq.size() + 1) < Res.size()) {
        TmpSeq.emplace_back(Opc, 0);
        Res = TmpSeq;
      }
    } else {
      // Try to use LUI+SH*ADD+ADDI.
      int64_t Hi52 = ((uint64_t)Val + 0x800ull) & ~0xfffull;
      int64_t Lo12 = SignExtend64<12>(Val);
      Div = 0;
      if (isInt<32>(Hi52 / 3) && (Hi52 % 3) == 0) {
        Div = 3;
        Opc = RISCV::SH1ADD;
      } else if (isInt<32>(Hi52 / 5) && (Hi52 % 5) == 0) {
        Div = 5;
        Opc = RISCV::SH2ADD;
      } else if (isInt<32>(Hi52 / 9) && (Hi52 % 9) == 0) {
        Div = 9;
        Opc = RISCV::SH3ADD;
      }
      // Build the new instruction sequence.
      if (Div > 0) {
        // A Val with a zero Lo12 (which implies Val == Hi52) should have
        // already been lowered to LUI+SH*ADD by the optimization above.
        assert(Lo12 != 0 &&
               "unexpected instruction sequence for immediate materialisation");
        assert(TmpSeq.empty() && "Expected empty TmpSeq");
        generateInstSeqImpl(Hi52 / Div, STI, TmpSeq);
        if ((TmpSeq.size() + 2) < Res.size()) {
          TmpSeq.emplace_back(Opc, 0);
          TmpSeq.emplace_back(RISCV::ADDI, Lo12);
          Res = TmpSeq;
        }
      }
    }
  }

  // Perform optimization with rori in the Zbb and th.srri in the XTheadBb
  // extensions.
  if (Res.size() > 2 && (STI.hasFeature(RISCV::FeatureStdExtZbb) ||
                         STI.hasFeature(RISCV::FeatureVendorXTHeadBb))) {
    if (unsigned Rotate = extractRotateInfo(Val)) {
      RISCVMatInt::InstSeq TmpSeq;
      uint64_t NegImm12 = llvm::rotl<uint64_t>(Val, Rotate);
      assert(isInt<12>(NegImm12));
      TmpSeq.emplace_back(RISCV::ADDI, NegImm12);
      TmpSeq.emplace_back(STI.hasFeature(RISCV::FeatureStdExtZbb)
                              ? RISCV::RORI
                              : RISCV::TH_SRRI,
                          Rotate);
      Res = TmpSeq;
    }
  }
  return Res;
}

void generateMCInstSeq(int64_t Val, const MCSubtargetInfo &STI,
                       MCRegister DestReg, SmallVectorImpl<MCInst> &Insts) {
  RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Val, STI);

  MCRegister SrcReg = RISCV::X0;
  for (RISCVMatInt::Inst &Inst : Seq) {
    switch (Inst.getOpndKind()) {
    case RISCVMatInt::Imm:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addImm(Inst.getImm()));
      break;
    case RISCVMatInt::RegX0:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addReg(RISCV::X0));
      break;
    case RISCVMatInt::RegReg:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addReg(SrcReg));
      break;
    case RISCVMatInt::RegImm:
      Insts.push_back(MCInstBuilder(Inst.getOpcode())
                          .addReg(DestReg)
                          .addReg(SrcReg)
                          .addImm(Inst.getImm()));
      break;
    }

    // Only the first instruction has X0 as its source.
    SrcReg = DestReg;
  }
}

InstSeq generateTwoRegInstSeq(int64_t Val, const MCSubtargetInfo &STI,
                              unsigned &ShiftAmt, unsigned &AddOpc) {
  int64_t LoVal = SignExtend64<32>(Val);
  if (LoVal == 0)
    return RISCVMatInt::InstSeq();

  // Subtract the LoVal to emulate the effect of the final ADD.
  uint64_t Tmp = (uint64_t)Val - (uint64_t)LoVal;
  assert(Tmp != 0);

  // Use trailing zero counts to figure how far we need to shift LoVal to line
  // up with the remaining constant.
  // TODO: This algorithm assumes all non-zero bits in the low 32 bits of the
  // final constant come from LoVal.
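  // For illustration, Val == 0x1234567812345678 gives LoVal == 0x12345678 and
  // Tmp == 0x1234567800000000, so ShiftAmt becomes 32 and AddOpc stays ADD:
  // the caller materializes LoVal once, shifts a copy left by 32, and adds the
  // two registers.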
  unsigned TzLo = llvm::countr_zero((uint64_t)LoVal);
  unsigned TzHi = llvm::countr_zero(Tmp);
  assert(TzLo < 32 && TzHi >= 32);
  ShiftAmt = TzHi - TzLo;
  AddOpc = RISCV::ADD;

  if (Tmp == ((uint64_t)LoVal << ShiftAmt))
    return RISCVMatInt::generateInstSeq(LoVal, STI);

  // If we have Zba, we can use (ADD_UW X, (SLLI X, 32)).
  if (STI.hasFeature(RISCV::FeatureStdExtZba) && Lo_32(Val) == Hi_32(Val)) {
    ShiftAmt = 32;
    AddOpc = RISCV::ADD_UW;
    return RISCVMatInt::generateInstSeq(LoVal, STI);
  }

  return RISCVMatInt::InstSeq();
}

int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI,
                  bool CompressionCost, bool FreeZeroes) {
  bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
  bool HasRVC = CompressionCost && (STI.hasFeature(RISCV::FeatureStdExtC) ||
                                    STI.hasFeature(RISCV::FeatureStdExtZca));
  int PlatRegSize = IsRV64 ? 64 : 32;

  // Split the constant into platform register sized chunks, and calculate cost
  // of each chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) {
    APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize);
    if (FreeZeroes && Chunk.getSExtValue() == 0)
      continue;
    InstSeq MatSeq = generateInstSeq(Chunk.getSExtValue(), STI);
    Cost += getInstSeqCost(MatSeq, HasRVC);
  }
  return std::max(FreeZeroes ? 0 : 1, Cost);
}

OpndKind Inst::getOpndKind() const {
  switch (Opc) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case RISCV::LUI:
    return RISCVMatInt::Imm;
  case RISCV::ADD_UW:
    return RISCVMatInt::RegX0;
  case RISCV::SH1ADD:
  case RISCV::SH2ADD:
  case RISCV::SH3ADD:
  case RISCV::PACK:
    return RISCVMatInt::RegReg;
  case RISCV::ADDI:
  case RISCV::ADDIW:
  case RISCV::XORI:
  case RISCV::SLLI:
  case RISCV::SRLI:
  case RISCV::SLLI_UW:
  case RISCV::RORI:
  case RISCV::BSETI:
  case RISCV::BCLRI:
  case RISCV::TH_SRRI:
    return RISCVMatInt::RegImm;
  }
}

} // namespace llvm::RISCVMatInt
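
// Illustrative end-to-end example: on RV64 with Zbb, generateInstSeq for the
// constant 0xF0FFFFFFFFFFFFFF returns ADDI -16 followed by RORI 8 (rotating
// 0xFFFFFFFFFFFFFFF0 right by 8 restores the constant), and generateMCInstSeq
// expands that to ADDI rd, x0, -16; RORI rd, rd, 8 for the chosen destination
// register rd.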