Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp
4574 views
/*1* Copyright 2020 Red Hat Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*/21#include "codegen/nv50_ir.h"22#include "codegen/nv50_ir_build_util.h"2324#include "codegen/nv50_ir_target_nvc0.h"25#include "codegen/nv50_ir_lowering_gv100.h"2627#include <limits>2829namespace nv50_ir {3031bool32GV100LegalizeSSA::handleCMP(Instruction *i)33{34Value *pred = bld.getSSA(1, FILE_PREDICATE);3536bld.mkCmp(OP_SET, reverseCondCode(i->asCmp()->setCond), TYPE_U8, pred,37i->sType, bld.mkImm(0), i->getSrc(2))->ftz = i->ftz;38bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);39return true;40}4142// NIR deals with most of these for us, but codegen generates more in pointer43// calculations from other lowering passes.44bool45GV100LegalizeSSA::handleIADD64(Instruction *i)46{47Value *carry = bld.getSSA(1, FILE_PREDICATE);48Value *def[2] = { bld.getSSA(), bld.getSSA() };49Value *src[2][2];5051for (int s = 0; s < 2; s++) {52if (i->getSrc(s)->reg.size == 8) {53bld.mkSplit(src[s], 4, i->getSrc(s));54} else {55src[s][0] = i->getSrc(s);56src[s][1] = bld.mkImm(0);57}58}5960bld.mkOp2(OP_ADD, TYPE_U32, def[0], src[0][0], src[1][0])->61setFlagsDef(1, carry);62bld.mkOp2(OP_ADD, TYPE_U32, def[1], src[0][1], src[1][1])->63setFlagsSrc(2, carry);64bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);65return true;66}6768bool69GV100LegalizeSSA::handleIMAD_HIGH(Instruction *i)70{71Value *def = bld.getSSA(8), *defs[2];72Value *src2;7374if (i->srcExists(2) &&75(!i->getSrc(2)->asImm() || i->getSrc(2)->asImm()->reg.data.u32)) {76Value *src2s[2] = { bld.getSSA(), bld.getSSA() };77bld.mkMov(src2s[0], bld.mkImm(0));78bld.mkMov(src2s[1], i->getSrc(2));79src2 = bld.mkOp2(OP_MERGE, TYPE_U64, bld.getSSA(8), src2s[0], src2s[1])->getDef(0);80} else {81src2 = bld.mkImm(0);82}8384bld.mkOp3(OP_MAD, isSignedType(i->sType) ? TYPE_S64 : TYPE_U64, def,85i->getSrc(0), i->getSrc(1), src2);8687bld.mkSplit(defs, 4, def);88i->def(0).replace(defs[1], false);89return true;90}9192// XXX: We should be able to do this in GV100LoweringPass, but codegen messes93// up somehow and swaps the condcode without swapping the sources.94// - tests/spec/glsl-1.50/execution/geometry/primitive-id-in.shader_test95bool96GV100LegalizeSSA::handleIMNMX(Instruction *i)97{98Value *pred = bld.getSSA(1, FILE_PREDICATE);99100bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, i->dType, pred,101i->sType, i->getSrc(0), i->getSrc(1));102bld.mkOp3(OP_SELP, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);103return true;104}105106bool107GV100LegalizeSSA::handleIMUL(Instruction *i)108{109if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)110return handleIMAD_HIGH(i);111112bld.mkOp3(OP_MAD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1),113bld.mkImm(0));114return true;115}116117bool118GV100LegalizeSSA::handleLOP2(Instruction *i)119{120uint8_t src0 = NV50_IR_SUBOP_LOP3_LUT_SRC0;121uint8_t src1 = NV50_IR_SUBOP_LOP3_LUT_SRC1;122uint8_t subOp;123124if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))125src0 = ~src0;126if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))127src1 = ~src1;128129switch (i->op) {130case OP_AND: subOp = src0 & src1; break;131case OP_OR : subOp = src0 | src1; break;132case OP_XOR: subOp = src0 ^ src1; break;133default:134unreachable("invalid LOP2 opcode");135}136137bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1),138bld.mkImm(0))->subOp = subOp;139return true;140}141142bool143GV100LegalizeSSA::handleNOT(Instruction *i)144{145bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), bld.mkImm(0), i->getSrc(0),146bld.mkImm(0))->subOp = (uint8_t)~NV50_IR_SUBOP_LOP3_LUT_SRC1;147return true;148}149150bool151GV100LegalizeSSA::handlePREEX2(Instruction *i)152{153i->def(0).replace(i->src(0), false);154return true;155}156157bool158GV100LegalizeSSA::handleQUADON(Instruction *i)159{160bld.mkBMov(i->getDef(0), bld.mkTSVal(TS_MACTIVE));161Instruction *b = bld.mkBMov(bld.mkTSVal(TS_PQUAD_MACTIVE), i->getDef(0));162b->fixed = 1;163return true;164}165166bool167GV100LegalizeSSA::handleQUADPOP(Instruction *i)168{169Instruction *b = bld.mkBMov(bld.mkTSVal(TS_MACTIVE), i->getSrc(0));170b->fixed = 1;171return true;172}173174bool175GV100LegalizeSSA::handleSET(Instruction *i)176{177Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;178Value *pred = bld.getSSA(1, FILE_PREDICATE), *met;179Instruction *xsetp;180181if (isFloatType(i->dType)) {182if (i->sType == TYPE_F32)183return false; // HW has FSET.BF184met = bld.mkImm(0x3f800000);185} else {186met = bld.mkImm(0xffffffff);187}188189xsetp = bld.mkCmp(i->op, i->asCmp()->setCond, TYPE_U8, pred, i->sType,190i->getSrc(0), i->getSrc(1));191xsetp->src(0).mod = i->src(0).mod;192xsetp->src(1).mod = i->src(1).mod;193xsetp->setSrc(2, src2);194xsetp->ftz = i->ftz;195196i = bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), bld.mkImm(0), met, pred);197i->src(2).mod = Modifier(NV50_IR_MOD_NOT);198return true;199}200201bool202GV100LegalizeSSA::handleSHFL(Instruction *i)203{204Instruction *sync = new_Instruction(func, OP_WARPSYNC, TYPE_NONE);205sync->fixed = 1;206sync->setSrc(0, bld.mkImm(0xffffffff));207i->bb->insertBefore(i, sync);208return false;209}210211bool212GV100LegalizeSSA::handleShift(Instruction *i)213{214Value *zero = bld.mkImm(0);215Value *src1 = i->getSrc(1);216Value *src0, *src2;217uint8_t subOp = i->op == OP_SHL ? NV50_IR_SUBOP_SHF_L : NV50_IR_SUBOP_SHF_R;218219if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) {220src0 = i->getSrc(0);221src2 = zero;222} else {223src0 = zero;224src2 = i->getSrc(0);225subOp |= NV50_IR_SUBOP_SHF_HI;226}227if (i->subOp & NV50_IR_SUBOP_SHIFT_WRAP)228subOp |= NV50_IR_SUBOP_SHF_W;229230bld.mkOp3(OP_SHF, i->dType, i->getDef(0), src0, src1, src2)->subOp = subOp;231return true;232}233234bool235GV100LegalizeSSA::handleSUB(Instruction *i)236{237Instruction *xadd =238bld.mkOp2(OP_ADD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1));239xadd->src(0).mod = i->src(0).mod;240xadd->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);241xadd->ftz = i->ftz;242return true;243}244245bool246GV100LegalizeSSA::visit(Instruction *i)247{248bool lowered = false;249250bld.setPosition(i, false);251if (i->sType == TYPE_F32 && i->dType != TYPE_F16 &&252prog->getType() != Program::TYPE_COMPUTE)253handleFTZ(i);254255switch (i->op) {256case OP_AND:257case OP_OR:258case OP_XOR:259if (i->def(0).getFile() != FILE_PREDICATE)260lowered = handleLOP2(i);261break;262case OP_NOT:263lowered = handleNOT(i);264break;265case OP_SHL:266case OP_SHR:267lowered = handleShift(i);268break;269case OP_SET:270case OP_SET_AND:271case OP_SET_OR:272case OP_SET_XOR:273if (i->def(0).getFile() != FILE_PREDICATE)274lowered = handleSET(i);275break;276case OP_SLCT:277lowered = handleCMP(i);278break;279case OP_PREEX2:280lowered = handlePREEX2(i);281break;282case OP_MUL:283if (!isFloatType(i->dType))284lowered = handleIMUL(i);285break;286case OP_MAD:287if (!isFloatType(i->dType) && i->subOp == NV50_IR_SUBOP_MUL_HIGH)288lowered = handleIMAD_HIGH(i);289break;290case OP_SHFL:291lowered = handleSHFL(i);292break;293case OP_QUADON:294lowered = handleQUADON(i);295break;296case OP_QUADPOP:297lowered = handleQUADPOP(i);298break;299case OP_SUB:300lowered = handleSUB(i);301break;302case OP_MAX:303case OP_MIN:304if (!isFloatType(i->dType))305lowered = handleIMNMX(i);306break;307case OP_ADD:308if (!isFloatType(i->dType) && typeSizeof(i->dType) == 8)309lowered = handleIADD64(i);310break;311case OP_PFETCH:312handlePFETCH(i);313break;314case OP_LOAD:315handleLOAD(i);316break;317default:318break;319}320321if (lowered)322delete_Instruction(prog, i);323324return true;325}326327bool328GV100LoweringPass::handleDMNMX(Instruction *i)329{330Value *pred = bld.getSSA(1, FILE_PREDICATE);331Value *src0[2], *src1[2], *dest[2];332333bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, TYPE_U32, pred,334i->sType, i->getSrc(0), i->getSrc(1));335bld.mkSplit(src0, 4, i->getSrc(0));336bld.mkSplit(src1, 4, i->getSrc(1));337bld.mkSplit(dest, 4, i->getDef(0));338bld.mkOp3(OP_SELP, TYPE_U32, dest[0], src0[0], src1[0], pred);339bld.mkOp3(OP_SELP, TYPE_U32, dest[1], src0[1], src1[1], pred);340bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), dest[0], dest[1]);341return true;342}343344bool345GV100LoweringPass::handleEXTBF(Instruction *i)346{347Value *bit = bld.getScratch();348Value *cnt = bld.getScratch();349Value *mask = bld.getScratch();350Value *zero = bld.mkImm(0);351352bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);353bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);354bld.mkOp2(OP_BMSK, TYPE_U32, mask, bit, cnt);355bld.mkOp2(OP_AND, TYPE_U32, mask, i->getSrc(0), mask);356bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), mask, bit);357if (isSignedType(i->dType))358bld.mkOp2(OP_SGXT, TYPE_S32, i->getDef(0), i->getDef(0), cnt);359360return true;361}362363bool364GV100LoweringPass::handleFLOW(Instruction *i)365{366i->op = OP_BRA;367return false;368}369370bool371GV100LoweringPass::handleI2I(Instruction *i)372{373bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), i->sType, i->getSrc(0))->374subOp = i->subOp;375bld.mkCvt(OP_CVT, i->dType, i->getDef(0), TYPE_F32, i->getDef(0));376return true;377}378379bool380GV100LoweringPass::handleINSBF(Instruction *i)381{382Value *bit = bld.getScratch();383Value *cnt = bld.getScratch();384Value *mask = bld.getScratch();385Value *src0 = bld.getScratch();386Value *zero = bld.mkImm(0);387388bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);389bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);390bld.mkOp2(OP_BMSK, TYPE_U32, mask, zero, cnt);391392bld.mkOp2(OP_AND, TYPE_U32, src0, i->getSrc(0), mask);393bld.mkOp2(OP_SHL, TYPE_U32, src0, src0, bit);394395bld.mkOp2(OP_SHL, TYPE_U32, mask, mask, bit);396bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), src0, i->getSrc(2), mask)->397subOp = NV50_IR_SUBOP_LOP3_LUT(a | (b & ~c));398399return true;400}401402bool403GV100LoweringPass::handlePINTERP(Instruction *i)404{405Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;406Instruction *ipa, *mul;407408ipa = bld.mkOp2(OP_LINTERP, TYPE_F32, i->getDef(0), i->getSrc(0), src2);409ipa->ipa = i->ipa;410mul = bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), i->getSrc(1));411412if (i->getInterpMode() == NV50_IR_INTERP_SC) {413ipa->setDef(1, bld.getSSA(1, FILE_PREDICATE));414mul->setPredicate(CC_NOT_P, ipa->getDef(1));415}416417return true;418}419420bool421GV100LoweringPass::handlePREFLOW(Instruction *i)422{423return true;424}425426bool427GV100LoweringPass::handlePRESIN(Instruction *i)428{429const float f = 1.0 / (2.0 * 3.14159265);430bld.mkOp2(OP_MUL, i->dType, i->getDef(0), i->getSrc(0), bld.mkImm(f));431return true;432}433434bool435GV100LoweringPass::visit(Instruction *i)436{437bool lowered = false;438439bld.setPosition(i, false);440441switch (i->op) {442case OP_BREAK:443case OP_CONT:444lowered = handleFLOW(i);445break;446case OP_PREBREAK:447case OP_PRECONT:448lowered = handlePREFLOW(i);449break;450case OP_CVT:451if (i->src(0).getFile() != FILE_PREDICATE &&452i->def(0).getFile() != FILE_PREDICATE &&453!isFloatType(i->dType) && !isFloatType(i->sType))454lowered = handleI2I(i);455break;456case OP_EXTBF:457lowered = handleEXTBF(i);458break;459case OP_INSBF:460lowered = handleINSBF(i);461break;462case OP_MAX:463case OP_MIN:464if (i->dType == TYPE_F64)465lowered = handleDMNMX(i);466break;467case OP_PINTERP:468lowered = handlePINTERP(i);469break;470case OP_PRESIN:471lowered = handlePRESIN(i);472break;473default:474break;475}476477if (lowered)478delete_Instruction(prog, i);479480return true;481}482483} // namespace nv50_ir484485486