// Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
// 4574 views
/*1* Copyright 2011 Christoph Bumiller2* 2014 Red Hat Inc.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice shall be included in12* all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR18* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,19* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR20* OTHER DEALINGS IN THE SOFTWARE.21*/2223#include "codegen/nv50_ir.h"24#include "codegen/nv50_ir_build_util.h"2526#include "codegen/nv50_ir_target_nvc0.h"27#include "codegen/nv50_ir_lowering_gm107.h"2829#include <limits>3031namespace nv50_ir {3233#define QOP_ADD 034#define QOP_SUBR 135#define QOP_SUB 236#define QOP_MOV2 33738// UL UR LL LR39#define QUADOP(q, r, s, t) \40((QOP_##q << 6) | (QOP_##r << 4) | \41(QOP_##s << 2) | (QOP_##t << 0))4243#define SHFL_BOUND_QUAD 0x1c034445void46GM107LegalizeSSA::handlePFETCH(Instruction *i)47{48Value *src0;4950if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))51return;5253bld.setPosition(i, false);54src0 = bld.getSSA();5556if (i->srcExists(1))57bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));58else59bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));6061i->setSrc(0, src0);62i->setSrc(1, 
NULL);63}6465void66GM107LegalizeSSA::handleLOAD(Instruction *i)67{68if (i->src(0).getFile() != FILE_MEMORY_CONST)69return;70if (i->src(0).isIndirect(0))71return;72if (typeSizeof(i->dType) != 4)73return;7475i->op = OP_MOV;76}7778void79GM107LegalizeSSA::handleQUADON(Instruction *i)80{81i->setDef(0, NULL);82}8384void85GM107LegalizeSSA::handleQUADPOP(Instruction *i)86{87i->setSrc(0, NULL);88}8990bool91GM107LegalizeSSA::visit(Instruction *i)92{93switch (i->op) {94case OP_QUADON:95handleQUADON(i);96break;97case OP_QUADPOP:98handleQUADPOP(i);99break;100case OP_PFETCH:101handlePFETCH(i);102break;103case OP_LOAD:104handleLOAD(i);105break;106default:107break;108}109return true;110}111112bool113GM107LoweringPass::handleManualTXD(TexInstruction *i)114{115// See NVC0LoweringPass::handleManualTXD for rationale. This function116// implements the same logic, but using SM50-friendly primitives.117static const uint8_t qOps[2] =118{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };119Value *def[4][4];120Value *crd[3], *arr, *shadow;121Value *tmp;122Instruction *tex, *add;123Value *quad = bld.mkImm(SHFL_BOUND_QUAD);124int l, c;125const int dim = i->tex.target.getDim() + i->tex.target.isCube();126const int array = i->tex.target.isArray();127const int indirect = i->tex.rIndirectSrc >= 0;128129i->op = OP_TEX; // no need to clone dPdx/dPdy later130131for (c = 0; c < dim; ++c)132crd[c] = bld.getScratch();133arr = bld.getScratch();134shadow = bld.getScratch();135tmp = bld.getScratch();136137for (l = 0; l < 4; ++l) {138Value *bar = bld.getSSA(4, FILE_BARRIER);139Value *src[3], *val;140Value *lane = bld.mkImm(l);141bld.mkOp(OP_QUADON, TYPE_U32, bar);142// Make sure lane 0 has the appropriate array/depth compare values143if (l != 0) {144if (array)145bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);146if (i->tex.target.isShadow())147bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad);148}149150// mov coordinates from lane l to all 
lanes151for (c = 0; c < dim; ++c) {152bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);153}154155// add dPdx from lane l to lanes dx156for (c = 0; c < dim; ++c) {157bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);158add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);159add->subOp = qOps[0];160add->lanes = 1; /* abused for .ndv */161}162163// add dPdy from lane l to lanes dy164for (c = 0; c < dim; ++c) {165bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);166add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);167add->subOp = qOps[1];168add->lanes = 1; /* abused for .ndv */169}170171// normalize cube coordinates if necessary172if (i->tex.target.isCube()) {173for (c = 0; c < 3; ++c)174src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);175val = bld.getScratch();176bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);177bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);178bld.mkOp1(OP_RCP, TYPE_F32, val, val);179for (c = 0; c < 3; ++c)180src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);181} else {182for (c = 0; c < dim; ++c)183src[c] = crd[c];184}185186// texture187bld.insert(tex = cloneForward(func, i));188if (l != 0) {189if (array)190tex->setSrc(0, arr);191if (i->tex.target.isShadow())192tex->setSrc(array + dim + indirect, shadow);193}194for (c = 0; c < dim; ++c)195tex->setSrc(c + array, src[c]);196// broadcast results from lane 0 to all lanes197if (l != 0)198for (c = 0; i->defExists(c); ++c)199bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad);200bld.mkOp1(OP_QUADPOP, TYPE_U32, NULL, bar)->fixed = 1;201202// save results203for (c = 0; i->defExists(c); ++c) {204Instruction *mov;205def[c][l] = bld.getSSA();206mov = bld.mkMov(def[c][l], tex->getDef(c));207mov->fixed = 1;208mov->lanes = 1 << l;209}210}211212for (c = 0; i->defExists(c); ++c) {213Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));214for (l = 0; l < 4; ++l)215u->setSrc(l, 
def[c][l]);216}217218i->bb->remove(i);219return true;220}221222bool223GM107LoweringPass::handleDFDX(Instruction *insn)224{225Instruction *shfl;226int qop = 0, xid = 0;227228switch (insn->op) {229case OP_DFDX:230qop = QUADOP(SUB, SUBR, SUB, SUBR);231xid = 1;232break;233case OP_DFDY:234qop = QUADOP(SUB, SUB, SUBR, SUBR);235xid = 2;236break;237default:238assert(!"invalid dfdx opcode");239break;240}241242shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),243bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));244shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;245insn->op = OP_QUADOP;246insn->subOp = qop;247insn->lanes = 0; /* abused for !.ndv */248insn->setSrc(1, insn->getSrc(0));249insn->setSrc(0, shfl->getDef(0));250return true;251}252253bool254GM107LoweringPass::handlePFETCH(Instruction *i)255{256Value *tmp0 = bld.getScratch();257Value *tmp1 = bld.getScratch();258Value *tmp2 = bld.getScratch();259bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));260bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));261bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));262if (i->getSrc(1))263bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));264else265bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));266bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);267i->setSrc(0, tmp0);268i->setSrc(1, NULL);269return true;270}271272bool273GM107LoweringPass::handlePOPCNT(Instruction *i)274{275Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),276i->getSrc(0), i->getSrc(1));277i->setSrc(0, tmp);278i->setSrc(1, NULL);279return true;280}281282bool283GM107LoweringPass::handleSUQ(TexInstruction *suq)284{285Value *ind = suq->getIndirectR();286Value *handle;287const int slot = suq->tex.r;288const int mask = suq->tex.mask;289290if (suq->tex.bindless)291handle = ind;292else293handle = loadTexHandle(ind, slot + 32);294295suq->tex.r = 0xff;296suq->tex.s = 0x1f;297298suq->setIndirectR(NULL);299suq->setSrc(0, 
handle);300suq->tex.rIndirectSrc = 0;301suq->setSrc(1, bld.loadImm(NULL, 0));302suq->tex.query = TXQ_DIMS;303suq->op = OP_TXQ;304305// We store CUBE / CUBE_ARRAY as a 2D ARRAY. Make sure that depth gets306// divided by 6.307if (mask & 0x4 && suq->tex.target.isCube()) {308int d = util_bitcount(mask & 0x3);309bld.setPosition(suq, true);310bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d), suq->getDef(d),311bld.loadImm(NULL, 6));312}313314// Samples come from a different query. If we want both samples and dims,315// create a second suq.316if (mask & 0x8) {317int d = util_bitcount(mask & 0x7);318Value *dst = suq->getDef(d);319TexInstruction *samples = suq;320assert(dst);321322if (mask != 0x8) {323suq->setDef(d, NULL);324suq->tex.mask &= 0x7;325samples = cloneShallow(func, suq);326for (int i = 0; i < d; i++)327samples->setDef(d, NULL);328samples->setDef(0, dst);329suq->bb->insertAfter(suq, samples);330}331samples->tex.mask = 0x4;332samples->tex.query = TXQ_TYPE;333}334335if (suq->tex.target.isMS()) {336bld.setPosition(suq, true);337338if (mask & 0x1)339bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),340loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));341if (mask & 0x2) {342int d = util_bitcount(mask & 0x1);343bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),344loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));345}346}347348return true;349}350351//352// - add quadop dance for texturing353// - put FP outputs in GPRs354// - convert instruction sequences355//356bool357GM107LoweringPass::visit(Instruction *i)358{359bld.setPosition(i, false);360361if (i->cc != CC_ALWAYS)362checkPredicate(i);363364switch (i->op) {365case OP_PFETCH:366return handlePFETCH(i);367case OP_DFDX:368case OP_DFDY:369return handleDFDX(i);370case OP_POPCNT:371return handlePOPCNT(i);372case OP_SUQ:373return handleSUQ(i->asTex());374default:375return NVC0LoweringPass::visit(i);376}377}378379} // namespace nv50_ir380381382