// Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
// 4574 views
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

// Per-lane quad operations, packed two bits per lane by QUADOP below.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL        UR        LL        LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

// Lower 32-bit integer DIV/MOD to a call into the driver's builtin library.
// The operands are moved into the fixed input registers of the calling
// convention, and the result is fetched from r0 (DIV) or r1 (MOD).
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;

   bld.setPosition(i, false);

   // Generate movs to the input regs for the call we want to generate
   for (int s = 0; i->srcExists(s); ++s) {
      Instruction *ld = i->getSrc(s)->getInsn();
      // check if we are moving an immediate, propagate it in that case
      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
            !(ld->src(0).getFile() == FILE_IMMEDIATE))
         bld.mkMovToReg(s, i->getSrc(s));
      else {
         assert(ld->getSrc(0) != NULL);
         bld.mkMovToReg(s, ld->getSrc(0));
         // Clear the src, to make code elimination possible here before we
         // delete the instruction i later
         i->setSrc(s, NULL);
         if (ld->isDead())
            delete_Instruction(prog, ld);
      }
   }

   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   // DIV result comes back in r0, MOD result in r1.
   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

// Lower 64-bit RCP/RSQ to a builtin-library call (GK104+ path). src[] holds
// the two 32-bit halves of the f64 source, already split by the caller.
void
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
{
   FlowInstruction *call;
   Value *def[2];
   int builtin;

   // Move the two source halves into the call's fixed input registers.
   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);

   if (i->op == OP_RCP)
      builtin = NVC0_BUILTIN_RCP_F64;
   else
      builtin = NVC0_BUILTIN_RSQ_F64;

   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   def[0] = bld.getSSA();
   def[1] = bld.getSSA();
   // Result halves come back in r0/r1; merge them into the 64-bit def.
   bld.mkMovFromReg(def[0], 0);
   bld.mkMovFromReg(def[1], 1);
   bld.mkClobber(FILE_GPR, 0x3fc, 2);
   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);

   prog->fp64 = true;
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   int chip = prog->getTarget()->getChipset();
   if (chip >= NVISA_GK104_CHIPSET) {
      // Kepler and up use the builtin-library implementation instead.
      handleRCPRSQLib(i, src);
      return;
   }

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

// Set the flush-to-zero flag on f32 ops where the hardware supports it.
void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

// If a TXL/TXF has a constant LOD of 0, drop the LOD argument and use the
// cheaper levelZero form (TXL additionally becomes a plain TEX).
void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   // Remove the now-unneeded LOD source.
   i->moveSources(arg + 1, -1);
}

// Lower a 64-bit shift into 32-bit operations on the split halves.
void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more directed SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts >= 32 and less than 32 as
      // separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
      // can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      // x32_minus_shift = 32 - shift (via negated-src ADD).
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32)
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      // Exactly one of hi1/hi2 was written (opposite predicates); UNION picks it.
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   // SM35+: use the funnel-shift (SHF) form — one op per half.
   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   bld.setPosition(hi, true);

   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}

// Lower a 64-bit integer comparison: subtract the low halves to produce a
// carry, then compare the high halves with the carry folded in.
void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];
   bld.setPosition(cmp, false);

   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}

// Lower bit-reverse to EXTBF with the reverse subop; 0x2000 encodes the
// (offset, width) bitfield argument covering the full 32-bit word.
void
NVC0LegalizeSSA::handleBREV(Instruction *i)
{
   i->op = OP_EXTBF;
   i->subOp = NV50_IR_SUBOP_EXTBF_REV;
   i->setSrc(1, bld.mkImm(0x2000));
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

// Dispatch each instruction of the block to the appropriate legalization
// handler above. Iterates via a saved `next` so handlers may delete `i`.
bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      // Graphics shaders get implicit flush-to-zero semantics on f32.
      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      case OP_SHR:
      case OP_SHL:
         if (typeSizeof(i->sType) == 8)
            handleShift(i);
         break;
      case OP_SET:
      case OP_SET_AND:
      case OP_SET_OR:
      case OP_SET_XOR:
         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
            handleSET(i->asCmp());
         break;
      case OP_BREV:
         handleBREV(i);
         break;
      default:
         break;
      }
   }
   return true;
}

// Texture barriers are only needed on Kepler-era chips (0xe0 <= chipset < 0x110).
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

// True if `early` dominates `later` (same block: ordered by serial number).
bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use the an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   // GPR range written by the tex (post-RA, so these are physical ids).
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

// DFS over the CFG starting at `start`, recording the first instruction on
// each path that reads or writes a GPR in [minGPR, maxGPR].
void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   // No use in this block on this path — continue into all successors.
   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // same block: level = number of TEXes between this one and the use
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // merge with an adjacent barrier, keeping the lower level
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

// Set up the fixed-register placeholders ($r63/$r255 zero reg, $c0, $p7 "one"
// predicate) used by the rewrites below, and run texbar insertion if needed.
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   // zero register is $r63 on Fermi, $r255 on Kepler+ (more GPRs)
   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

// Replace immediate-zero sources with the hardware zero register (and for
// SELP's predicate source, with the constant-true predicate, inverting as
// needed), since immediates can't always be encoded directly.
void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      // SUCLAMP src2 and SHLADD src1 must stay immediates.
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      if (s == 1 && i->op == OP_SHLADD)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // find the (single) back edge into this block
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

// replaces instructions which would end up as f2f or i2i with faster
// alternatives:
//  - fabs(a) -> fadd(0, abs a)
//  - fneg(a) -> fadd(neg 0, neg a)
//  - ineg(a) -> iadd(0, neg a)
//  - fneg(abs a) -> fadd(neg 0, neg abs a)
//  - sat(a) -> sat add(0, a)
void
NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
{
   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
      return;
   if (cvt->sType != cvt->dType)
      return;
   // we could make it work, but in this case we have optimizations disabled
   // and we don't really care either way.
   if (cvt->src(0).getFile() != FILE_GPR &&
       cvt->src(0).getFile() != FILE_MEMORY_CONST)
      return;

   Modifier mod0, mod1;

   switch (cvt->op) {
   case OP_ABS:
      if (cvt->src(0).mod)
         return;
      if (!isFloatType(cvt->sType))
         return;
      mod0 = 0;
      mod1 = NV50_IR_MOD_ABS;
      break;
   case OP_NEG:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
         return;
      if (isFloatType(cvt->sType) &&
          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
         return;

      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
      break;
   case OP_SAT:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
         return;
      mod0 = 0;
      mod1 = cvt->src(0).mod;
      cvt->saturate = true;
      break;
   default:
      return;
   }

   // rewrite as ADD(0, src) carrying the modifiers computed above
   cvt->op = OP_ADD;
   cvt->moveSources(0, 1);
   cvt->setSrc(0, rZero);
   cvt->src(0).mod = mod0;
   cvt->src(1).mod = mod1;
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         // fold large offsets into the constant-buffer file index
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) >= 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);

         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
            replaceCvt(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()),
   gpEmitAddress(NULL)
{
   bld.setProgram(prog);
}

// For geometry shaders, set up the emit-address counter in the entry block
// and flush it (FINAL on GV100+, then mov to r0) at the exit.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET)
            bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1;
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

// Load a texture handle from the driver's texture-binding area in the aux
// constant buffer; `ptr` (if non-NULL) indexes into it (scaled by 4).
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      // normalize cube coords: divide each by the largest |component|
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         // pack the sampler handle into the high half of the texture handle
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         // TXF layers are integer indices; others are floats to be saturated
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         // shift coords up to make room for the combined arg in src 0
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert the tic/tsc relative indices into their bitfields
      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };

   Value *def[4][4];
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates).
Maxwell is1209// handled in a separate function.1210int array;1211if (targ->getChipset() < NVISA_GK104_CHIPSET)1212array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;1213else1214array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);12151216i->op = OP_TEX; // no need to clone dPdx/dPdy later12171218for (c = 0; c < dim; ++c)1219crd[c] = bld.getScratch();1220for (c = 0; c < array; ++c)1221arr[c] = bld.getScratch();1222shadow = bld.getScratch();12231224for (l = 0; l < 4; ++l) {1225Value *src[3], *val;12261227bld.mkOp(OP_QUADON, TYPE_NONE, NULL);1228// we're using the texture result from lane 0 in all cases, so make sure1229// that lane 0 is pointing at the proper array index, indirect value,1230// and depth compare.1231if (l != 0) {1232for (c = 0; c < array; ++c)1233bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);1234if (i->tex.target.isShadow()) {1235// The next argument after coords is the depth compare1236bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);1237}1238}1239// mov position coordinates from lane l to all lanes1240for (c = 0; c < dim; ++c)1241bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);1242// add dPdx from lane l to lanes dx1243for (c = 0; c < dim; ++c)1244bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);1245// add dPdy from lane l to lanes dy1246for (c = 0; c < dim; ++c)1247bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);1248// normalize cube coordinates1249if (i->tex.target.isCube()) {1250for (c = 0; c < 3; ++c)1251src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);1252val = bld.getScratch();1253bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);1254bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);1255bld.mkOp1(OP_RCP, TYPE_F32, val, val);1256for (c = 0; c < 3; ++c)1257src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);1258} else {1259for (c = 0; c < dim; ++c)1260src[c] = crd[c];1261}1262// texture1263bld.insert(tex = cloneForward(func, i));1264if (l != 0) {1265for (c = 0; c 
< array; ++c)1266tex->setSrc(c, arr[c]);1267if (i->tex.target.isShadow())1268tex->setSrc(array + dim, shadow);1269}1270for (c = 0; c < dim; ++c)1271tex->setSrc(c + array, src[c]);1272// broadcast results from lane 0 to all lanes so that the moves *into*1273// the target lane pick up the proper value.1274if (l != 0)1275for (c = 0; i->defExists(c); ++c)1276bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);1277bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);12781279// save results1280for (c = 0; i->defExists(c); ++c) {1281Instruction *mov;1282def[c][l] = bld.getSSA();1283mov = bld.mkMov(def[c][l], tex->getDef(c));1284mov->fixed = 1;1285mov->lanes = 1 << l;1286}1287}12881289for (c = 0; i->defExists(c); ++c) {1290Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));1291for (l = 0; l < 4; ++l)1292u->setSrc(l, def[c][l]);1293}12941295i->bb->remove(i);1296return true;1297}12981299bool1300NVC0LoweringPass::handleTXD(TexInstruction *txd)1301{1302int dim = txd->tex.target.getDim() + txd->tex.target.isCube();1303unsigned arg = txd->tex.target.getArgCount();1304unsigned expected_args = arg;1305const int chipset = prog->getTarget()->getChipset();13061307if (chipset >= NVISA_GK104_CHIPSET) {1308if (!txd->tex.target.isArray() && txd->tex.useOffsets)1309expected_args++;1310if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)1311expected_args++;1312} else {1313if (txd->tex.useOffsets)1314expected_args++;1315if (!txd->tex.target.isArray() && (1316txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))1317expected_args++;1318}13191320if (expected_args > 4 ||1321dim > 2 ||1322txd->tex.target.isShadow())1323txd->op = OP_TEX;13241325handleTEX(txd);1326while (txd->srcExists(arg))1327++arg;13281329txd->tex.derivAll = true;1330if (txd->op == OP_TEX)1331return handleManualTXD(txd);13321333assert(arg == expected_args);1334for (int c = 0; c < dim; ++c) {1335txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);1336txd->setSrc(arg + c * 2 + 1, 
txd->dPdy[c]);1337txd->dPdx[c].set(NULL);1338txd->dPdy[c].set(NULL);1339}13401341// In this case we have fewer than 4 "real" arguments, which means that1342// handleTEX didn't apply any padding. However we have to make sure that1343// the second "group" of arguments still gets padded up to 4.1344if (chipset >= NVISA_GK104_CHIPSET) {1345int s = arg + 2 * dim;1346if (s >= 4 && s < 7) {1347if (txd->srcExists(s)) // move potential predicate out of the way1348txd->moveSources(s, 7 - s);1349while (s < 7)1350txd->setSrc(s++, bld.loadImm(NULL, 0));1351}1352}13531354return true;1355}13561357bool1358NVC0LoweringPass::handleTXQ(TexInstruction *txq)1359{1360const int chipset = prog->getTarget()->getChipset();1361if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)1362txq->tex.r += prog->driver->io.texBindBase / 4;13631364if (txq->tex.rIndirectSrc < 0)1365return true;13661367Value *ticRel = txq->getIndirectR();13681369txq->setIndirectS(NULL);1370txq->tex.sIndirectSrc = -1;13711372assert(ticRel);13731374if (chipset < NVISA_GK104_CHIPSET) {1375LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa13761377txq->setSrc(txq->tex.rIndirectSrc, NULL);1378if (txq->tex.r)1379ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),1380ticRel, bld.mkImm(txq->tex.r));13811382bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));13831384txq->moveSources(0, 1);1385txq->setSrc(0, src);1386} else {1387Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);1388txq->tex.r = 0xff;1389txq->tex.s = 0x1f;13901391txq->setIndirectR(NULL);1392txq->moveSources(0, 1);1393txq->setSrc(0, hnd);1394txq->tex.rIndirectSrc = 0;1395}13961397return true;1398}13991400bool1401NVC0LoweringPass::handleTXLQ(TexInstruction *i)1402{1403/* The outputs are inverted compared to what the TGSI instruction1404* expects. 
Take that into account in the mask.1405*/1406assert((i->tex.mask & ~3) == 0);1407if (i->tex.mask == 1)1408i->tex.mask = 2;1409else if (i->tex.mask == 2)1410i->tex.mask = 1;1411handleTEX(i);1412bld.setPosition(i, true);14131414/* The returned values are not quite what we want:1415* (a) convert from s16/u16 to f321416* (b) multiply by 1/2561417*/1418for (int def = 0; def < 2; ++def) {1419if (!i->defExists(def))1420continue;1421enum DataType type = TYPE_S16;1422if (i->tex.mask == 2 || def > 0)1423type = TYPE_U16;1424bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));1425bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),1426i->getDef(def), bld.loadImm(NULL, 1.0f / 256));1427}1428if (i->tex.mask == 3) {1429LValue *t = new_LValue(func, FILE_GPR);1430bld.mkMov(t, i->getDef(0));1431bld.mkMov(i->getDef(0), i->getDef(1));1432bld.mkMov(i->getDef(1), t);1433}1434return true;1435}14361437bool1438NVC0LoweringPass::handleBUFQ(Instruction *bufq)1439{1440bufq->op = OP_MOV;1441bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),1442bufq->getSrc(0)->reg.fileIndex * 16));1443bufq->setIndirect(0, 0, NULL);1444bufq->setIndirect(0, 1, NULL);1445return true;1446}14471448void1449NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)1450{1451assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);14521453BasicBlock *currBB = atom->bb;1454BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);1455BasicBlock *joinBB = atom->bb->splitAfter(atom);1456BasicBlock *setAndUnlockBB = new BasicBlock(func);1457BasicBlock *failLockBB = new BasicBlock(func);14581459bld.setPosition(currBB, true);1460assert(!currBB->joinAt);1461currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);14621463CmpInstruction *pred =1464bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),1465TYPE_U32, bld.mkImm(0), bld.mkImm(1));14661467bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);1468currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);14691470bld.setPosition(tryLockBB, 
true);14711472Instruction *ld =1473bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),1474atom->getIndirect(0, 0));1475ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));1476ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;14771478bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));1479bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);1480tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);1481tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);14821483tryLockBB->cfg.detach(&joinBB->cfg);1484bld.remove(atom);14851486bld.setPosition(setAndUnlockBB, true);1487Value *stVal;1488if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {1489// Read the old value, and write the new one.1490stVal = atom->getSrc(1);1491} else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {1492CmpInstruction *set =1493bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),1494TYPE_U32, ld->getDef(0), atom->getSrc(1));14951496bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),1497TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));1498} else {1499operation op;15001501switch (atom->subOp) {1502case NV50_IR_SUBOP_ATOM_ADD:1503op = OP_ADD;1504break;1505case NV50_IR_SUBOP_ATOM_AND:1506op = OP_AND;1507break;1508case NV50_IR_SUBOP_ATOM_OR:1509op = OP_OR;1510break;1511case NV50_IR_SUBOP_ATOM_XOR:1512op = OP_XOR;1513break;1514case NV50_IR_SUBOP_ATOM_MIN:1515op = OP_MIN;1516break;1517case NV50_IR_SUBOP_ATOM_MAX:1518op = OP_MAX;1519break;1520default:1521assert(0);1522return;1523}15241525stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),1526atom->getSrc(1));1527}15281529Instruction *st =1530bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),1531atom->getIndirect(0, 0), stVal);1532st->setDef(0, pred->getDef(0));1533st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;15341535bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);1536setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);15371538// Lock until the store has not been performed.1539bld.setPosition(failLockBB, 
true);1540bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));1541bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);1542failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);1543failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);15441545bld.setPosition(joinBB, false);1546bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;1547}15481549void1550NVC0LoweringPass::handleSharedATOM(Instruction *atom)1551{1552assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);15531554BasicBlock *currBB = atom->bb;1555BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);1556BasicBlock *joinBB = atom->bb->splitAfter(atom);15571558bld.setPosition(currBB, true);1559assert(!currBB->joinAt);1560currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);15611562bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);1563currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);15641565bld.setPosition(tryLockAndSetBB, true);15661567Instruction *ld =1568bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),1569atom->getIndirect(0, 0));1570ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));1571ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;15721573Value *stVal;1574if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {1575// Read the old value, and write the new one.1576stVal = atom->getSrc(1);1577} else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {1578CmpInstruction *set =1579bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),1580TYPE_U32, ld->getDef(0), atom->getSrc(1));1581set->setPredicate(CC_P, ld->getDef(1));15821583Instruction *selp =1584bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),1585atom->getSrc(2), set->getDef(0));1586selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);1587selp->setPredicate(CC_P, ld->getDef(1));15881589stVal = selp->getDef(0);1590} else {1591operation op;15921593switch (atom->subOp) {1594case NV50_IR_SUBOP_ATOM_ADD:1595op = OP_ADD;1596break;1597case NV50_IR_SUBOP_ATOM_AND:1598op = OP_AND;1599break;1600case NV50_IR_SUBOP_ATOM_OR:1601op = 
OP_OR;1602break;1603case NV50_IR_SUBOP_ATOM_XOR:1604op = OP_XOR;1605break;1606case NV50_IR_SUBOP_ATOM_MIN:1607op = OP_MIN;1608break;1609case NV50_IR_SUBOP_ATOM_MAX:1610op = OP_MAX;1611break;1612default:1613assert(0);1614return;1615}16161617Instruction *i =1618bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),1619atom->getSrc(1));1620i->setPredicate(CC_P, ld->getDef(1));16211622stVal = i->getDef(0);1623}16241625Instruction *st =1626bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),1627atom->getIndirect(0, 0), stVal);1628st->setPredicate(CC_P, ld->getDef(1));1629st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;16301631// Loop until the lock is acquired.1632bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));1633tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);1634tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);1635bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);16361637bld.remove(atom);16381639bld.setPosition(joinBB, false);1640bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;1641}16421643bool1644NVC0LoweringPass::handleATOM(Instruction *atom)1645{1646SVSemantic sv;1647Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;16481649switch (atom->src(0).getFile()) {1650case FILE_MEMORY_LOCAL:1651sv = SV_LBASE;1652break;1653case FILE_MEMORY_SHARED:1654// For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic1655// operations on shared memory. 
For Maxwell, ATOMS is enough.1656if (targ->getChipset() < NVISA_GK104_CHIPSET)1657handleSharedATOM(atom);1658else if (targ->getChipset() < NVISA_GM107_CHIPSET)1659handleSharedATOMNVE4(atom);1660return true;1661case FILE_MEMORY_GLOBAL:1662return true;1663default:1664assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);1665base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);1666assert(base->reg.size == 8);1667if (ptr)1668base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);1669assert(base->reg.size == 8);1670atom->setIndirect(0, 0, base);1671atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;16721673// Harden against out-of-bounds accesses1674Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));1675Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);1676Value *pred = new_LValue(func, FILE_PREDICATE);1677if (ptr)1678bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);1679bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);1680atom->setPredicate(CC_NOT_P, pred);1681if (atom->defExists(0)) {1682Value *zero, *dst = atom->getDef(0);1683atom->setDef(0, bld.getSSA());16841685bld.setPosition(atom, true);1686bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))1687->setPredicate(CC_P, pred);1688bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);1689}16901691return true;1692}1693base =1694bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));16951696atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));1697atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;1698if (ptr)1699base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);1700atom->setIndirect(0, 1, NULL);1701atom->setIndirect(0, 0, base);17021703return true;1704}17051706bool1707NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)1708{1709if (targ->getChipset() < NVISA_GM107_CHIPSET) {1710if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {1711// ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().1712return 
false;1713}1714}17151716if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&1717cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)1718return false;1719bld.setPosition(cas, true);17201721if (needCctl) {1722Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));1723cctl->setIndirect(0, 0, cas->getIndirect(0, 0));1724cctl->fixed = 1;1725cctl->subOp = NV50_IR_SUBOP_CCTL_IV;1726if (cas->isPredicated())1727cctl->setPredicate(cas->cc, cas->getPredicate());1728}17291730if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&1731targ->getChipset() < NVISA_GV100_CHIPSET) {1732// CAS is crazy. It's 2nd source is a double reg, and the 3rd source1733// should be set to the high part of the double reg or bad things will1734// happen elsewhere in the universe.1735// Also, it sometimes returns the new value instead of the old one1736// under mysterious circumstances.1737DataType ty = typeOfSize(typeSizeof(cas->dType) * 2);1738Value *dreg = bld.getSSA(typeSizeof(ty));1739bld.setPosition(cas, false);1740bld.mkOp2(OP_MERGE, ty, dreg, cas->getSrc(1), cas->getSrc(2));1741cas->setSrc(1, dreg);1742cas->setSrc(2, dreg);1743}17441745return true;1746}17471748inline Value *1749NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)1750{1751uint8_t b = prog->driver->io.auxCBSlot;1752off += base;17531754return bld.1755mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);1756}17571758inline Value *1759NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)1760{1761uint8_t b = prog->driver->io.auxCBSlot;1762off += base;17631764if (ptr)1765ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));17661767return bld.1768mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);1769}17701771inline Value *1772NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)1773{1774uint8_t b = prog->driver->io.auxCBSlot;1775off += base;17761777if (ptr)1778ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, 
bld.mkImm(4));17791780return bld.1781mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);1782}17831784inline Value *1785NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)1786{1787return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);1788}17891790inline Value *1791NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)1792{1793return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);1794}17951796inline Value *1797NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)1798{1799return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);1800}18011802inline Value *1803NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)1804{1805return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);1806}18071808inline Value *1809NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)1810{1811uint8_t b = prog->driver->io.msInfoCBSlot;1812off += prog->driver->io.msInfoBase;1813return bld.1814mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);1815}18161817inline Value *1818NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)1819{1820uint32_t base = slot * NVC0_SU_INFO__STRIDE;18211822// We don't upload surface info for bindless for GM107+1823assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);18241825if (ptr) {1826ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));1827if (bindless)1828ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));1829else1830ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));1831ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));1832base = 0;1833}1834off += base;18351836return loadResInfo32(ptr, off, bindless ? 
prog->driver->io.bindlessBase :1837prog->driver->io.suInfoBase);1838}18391840Value *1841NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)1842{1843if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)1844return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);18451846assert(bindless);18471848Value *samples = bld.getSSA();1849// this shouldn't be lowered because it's being inserted before the current instruction1850TexInstruction *tex = new_TexInstruction(func, OP_TXQ);1851tex->tex.target = target;1852tex->tex.query = TXQ_TYPE;1853tex->tex.mask = 0x4;1854tex->tex.r = 0xff;1855tex->tex.s = 0x1f;1856tex->tex.rIndirectSrc = 0;1857tex->setDef(0, samples);1858tex->setSrc(0, ind);1859tex->setSrc(1, bld.loadImm(NULL, 0));1860bld.insert(tex);18611862// doesn't work with sample counts other than 1/2/4/8 but they aren't supported1863switch (index) {1864case 0: {1865Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));1866return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));1867}1868case 1: {1869Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);1870return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));1871}1872default: {1873assert(false);1874return NULL;1875}1876}1877}18781879static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)1880{1881switch (su->tex.target.getEnum()) {1882case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);1883case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1884case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1885case TEX_TARGET_1D_ARRAY: return (c == 1) ?1886NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :1887NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1888case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);1889case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);1890case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 
2);1891case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1892case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1893case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1894case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1895default:1896assert(0);1897return 0;1898}1899}19001901bool1902NVC0LoweringPass::handleSUQ(TexInstruction *suq)1903{1904int mask = suq->tex.mask;1905int dim = suq->tex.target.getDim();1906int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());1907Value *ind = suq->getIndirectR();1908int slot = suq->tex.r;1909int c, d;19101911for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {1912if (c >= arg || !(mask & 1))1913continue;19141915int offset;19161917if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {1918offset = NVC0_SU_INFO_SIZE(2);1919} else {1920offset = NVC0_SU_INFO_SIZE(c);1921}1922bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));1923if (c == 2 && suq->tex.target.isCube())1924bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),1925bld.loadImm(NULL, 6));1926}19271928if (mask & 1) {1929if (suq->tex.target.isMS()) {1930Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);1931Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);1932Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);1933bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);1934} else {1935bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));1936}1937}19381939bld.remove(suq);1940return true;1941}19421943void1944NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)1945{1946const int arg = tex->tex.target.getArgCount();1947int slot = tex->tex.r;19481949if (tex->tex.target == TEX_TARGET_2D_MS)1950tex->tex.target = TEX_TARGET_2D;1951else1952if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)1953tex->tex.target = TEX_TARGET_2D_ARRAY;1954else1955return;19561957Value *x = tex->getSrc(0);1958Value *y = 
tex->getSrc(1);1959Value *s = tex->getSrc(arg - 1);19601961Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();1962Value *ind = tex->getIndirectR();19631964Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);1965Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);19661967bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);1968bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);19691970s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));1971s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));19721973Value *dx = loadMsInfo32(ts, 0x0);1974Value *dy = loadMsInfo32(ts, 0x4);19751976bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);1977bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);19781979tex->setSrc(0, tx);1980tex->setSrc(1, ty);1981tex->moveSources(arg, -1);1982}19831984// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.1985// They're computed from the coordinates using the surface info in c[] space.1986void1987NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)1988{1989Instruction *insn;1990const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;1991const bool raw =1992su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;1993const int slot = su->tex.r;1994const int dim = su->tex.target.getDim();1995const bool array = su->tex.target.isArray() || su->tex.target.isCube();1996const int arg = dim + array;1997int c;1998Value *zero = bld.mkImm(0);1999Value *p1 = NULL;2000Value *v;2001Value *src[3];2002Value *bf, *eau, *off;2003Value *addr, *pred;2004Value *ind = su->getIndirectR();2005Value *y, *z;20062007off = bld.getScratch(4);2008bf = bld.getScratch(4);2009addr = bld.getSSA(8);2010pred = bld.getScratch(1, FILE_PREDICATE);20112012bld.setPosition(su, false);20132014adjustCoordinatesMS(su);20152016// calculate clamped coordinates2017for (c = 0; c < arg; ++c) {2018int dimc = c;20192020if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {2021// The array index is stored in the Z 
component for 1D arrays.2022dimc = 2;2023}20242025src[c] = bld.getScratch();2026if (c == 0 && raw)2027v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);2028else2029v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);2030bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)2031->subOp = getSuClampSubOp(su, dimc);2032}2033for (; c < 3; ++c)2034src[c] = zero;20352036if (dim == 2 && !array) {2037v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2038src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2039v, bld.loadImm(NULL, 16));20402041v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);2042bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)2043->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);2044}20452046// set predicate output2047if (su->tex.target == TEX_TARGET_BUFFER) {2048src[0]->getInsn()->setFlagsDef(1, pred);2049} else2050if (array) {2051p1 = bld.getSSA(1, FILE_PREDICATE);2052src[dim]->getInsn()->setFlagsDef(1, p1);2053}20542055// calculate pixel offset2056if (dim == 1) {2057y = z = zero;2058if (su->tex.target != TEX_TARGET_BUFFER)2059bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));2060} else {2061y = src[1];2062z = src[2];20632064v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2065bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])2066->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l20672068v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);2069bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])2070->subOp = array ?2071NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l2072}20732074// calculate effective address part 12075if (su->tex.target == TEX_TARGET_BUFFER) {2076if (raw) {2077bf = src[0];2078} else {2079v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);2080bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)2081->subOp = NV50_IR_SUBOP_V1(7,6,8|2);2082}2083} else {2084uint16_t subOp = 
0;20852086switch (dim) {2087case 1:2088break;2089case 2:2090if (array) {2091z = off;2092} else {2093subOp = NV50_IR_SUBOP_SUBFM_3D;2094}2095break;2096default:2097subOp = NV50_IR_SUBOP_SUBFM_3D;2098assert(dim == 3);2099break;2100}2101insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);2102insn->subOp = subOp;2103insn->setFlagsDef(1, pred);2104}21052106// part 22107v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);21082109if (su->tex.target == TEX_TARGET_BUFFER) {2110eau = v;2111} else {2112eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);2113}2114// add array layer offset2115if (array) {2116v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);2117if (dim == 1)2118bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)2119->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u322120else2121bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)2122->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u322123// combine predicates2124assert(p1);2125bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);2126}21272128if (atom) {2129Value *lo = bf;2130if (su->tex.target == TEX_TARGET_BUFFER) {2131lo = zero;2132bld.mkMov(off, bf);2133}2134// bf == g[] address & 0xff2135// eau == g[] address >> 82136bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);2137bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);2138} else2139if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {2140// Convert from u32 to u8 address format, which is what the library code2141// doing SULDP currently uses.2142// XXX: can SUEAU do this ?2143// XXX: does it matter that we don't mask high bytes in bf ?2144// Grrr.2145bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));2146bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);2147}21482149bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);21502151if (atom && su->tex.target == TEX_TARGET_BUFFER)2152bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);21532154// let's just set it 0 for raw access and hope it 
works2155v = raw ?2156bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);21572158// get rid of old coordinate sources, make space for fmt info and predicate2159su->moveSources(arg, 3 - arg);2160// set 64 bit address and 32-bit format sources2161su->setSrc(0, addr);2162su->setSrc(1, v);2163su->setSrc(2, pred);2164su->setIndirectR(NULL);21652166// prevent read fault when the image is not actually bound2167CmpInstruction *pred1 =2168bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2169TYPE_U32, bld.mkImm(0),2170loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));21712172if (su->op != OP_SUSTP && su->tex.format) {2173const TexInstruction::ImgFormatDesc *format = su->tex.format;2174int blockwidth = format->bits[0] + format->bits[1] +2175format->bits[2] + format->bits[3];21762177// make sure that the format doesn't mismatch2178assert(format->components != 0);2179bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),2180TYPE_U32, bld.loadImm(NULL, blockwidth / 8),2181loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2182pred1->getDef(0));2183}2184su->setPredicate(CC_NOT_P, pred1->getDef(0));21852186// TODO: initialize def values to 0 when the surface operation is not2187// performed (not needed for stores). Also, fix the "address bounds test"2188// subtests from arb_shader_image_load_store-invalid for buffers, because it2189// seems like that the predicate is not correctly set by suclamp.2190}21912192static DataType2193getSrcType(const TexInstruction::ImgFormatDesc *t, int c)2194{2195switch (t->type) {2196case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;2197case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;2198case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;2199case UINT:2200return (t->bits[c] == 8 ? TYPE_U8 :2201(t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));2202case SINT:2203return (t->bits[c] == 8 ? TYPE_S8 :2204(t->bits[c] == 16 ? 
TYPE_S16 : TYPE_S32));2205}2206return TYPE_NONE;2207}22082209static DataType2210getDestType(const ImgType type) {2211switch (type) {2212case FLOAT:2213case UNORM:2214case SNORM:2215return TYPE_F32;2216case UINT:2217return TYPE_U32;2218case SINT:2219return TYPE_S32;2220default:2221assert(!"Impossible type");2222return TYPE_NONE;2223}2224}22252226void2227NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)2228{2229const TexInstruction::ImgFormatDesc *format = su->tex.format;2230int width = format->bits[0] + format->bits[1] +2231format->bits[2] + format->bits[3];2232Value *untypedDst[4] = {};2233Value *typedDst[4] = {};22342235// We must convert this to a generic load.2236su->op = OP_SULDB;22372238su->dType = typeOfSize(width / 8);2239su->sType = TYPE_U8;22402241for (int i = 0; i < width / 32; i++)2242untypedDst[i] = bld.getSSA();2243if (width < 32)2244untypedDst[0] = bld.getSSA();22452246if (loaded && loaded[0]) {2247for (int i = 0; i < 4; i++) {2248if (loaded[i])2249typedDst[i] = loaded[i]->getDef(0);2250}2251} else {2252for (int i = 0; i < 4; i++) {2253typedDst[i] = su->getDef(i);2254}2255}22562257// Set the untyped dsts as the su's destinations2258if (loaded && loaded[0]) {2259for (int i = 0; i < 4; i++)2260if (loaded[i])2261loaded[i]->setDef(0, untypedDst[i]);2262} else {2263for (int i = 0; i < 4; i++)2264su->setDef(i, untypedDst[i]);22652266bld.setPosition(su, true);2267}22682269// Unpack each component into the typed dsts2270int bits = 0;2271for (int i = 0; i < 4; bits += format->bits[i], i++) {2272if (!typedDst[i])2273continue;22742275if (loaded && loaded[0])2276bld.setPosition(loaded[i], true);22772278if (i >= format->components) {2279if (format->type == FLOAT ||2280format->type == UNORM ||2281format->type == SNORM)2282bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);2283else2284bld.loadImm(typedDst[i], i == 3 ? 
1 : 0);2285continue;2286}22872288// Get just that component's data into the relevant place2289if (format->bits[i] == 32)2290bld.mkMov(typedDst[i], untypedDst[i]);2291else if (format->bits[i] == 16)2292bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],2293getSrcType(format, i), untypedDst[i / 2])2294->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);2295else if (format->bits[i] == 8)2296bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],2297getSrcType(format, i), untypedDst[0])->subOp = i;2298else {2299bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],2300bld.mkImm((bits % 32) | (format->bits[i] << 8)));2301if (format->type == UNORM || format->type == SNORM)2302bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);2303}23042305// Normalize / convert as necessary2306if (format->type == UNORM)2307bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));2308else if (format->type == SNORM)2309bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));2310else if (format->type == FLOAT && format->bits[i] < 16) {2311bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));2312bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);2313}2314}23152316if (format->bgra) {2317std::swap(typedDst[0], typedDst[2]);2318}2319}23202321void2322NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)2323{2324if (!su->getPredicate())2325return;23262327bld.setPosition(su, true);23282329for (unsigned i = 0; su->defExists(i); ++i) {2330Value *def = su->getDef(i);2331Value *newDef = bld.getSSA();2332su->setDef(i, newDef);23332334Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2335assert(su->cc == CC_NOT_P);2336mov->setPredicate(CC_P, su->getPredicate());2337Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), newDef, mov->getDef(0));2338bld.mkMov(def, 
uni->getDef(0));2339}2340}23412342void2343NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)2344{2345processSurfaceCoordsNVE4(su);23462347if (su->op == OP_SULDP) {2348convertSurfaceFormat(su, NULL);2349insertOOBSurfaceOpResult(su);2350}23512352if (su->op == OP_SUREDB || su->op == OP_SUREDP) {2353assert(su->getPredicate());2354Value *pred =2355bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),2356su->getPredicate(), su->getSrc(2));23572358Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());2359red->subOp = su->subOp;2360red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));2361red->setSrc(1, su->getSrc(3));2362if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)2363red->setSrc(2, su->getSrc(4));2364red->setIndirect(0, 0, su->getSrc(0));23652366// make sure to initialize dst value when the atomic operation is not2367// performed2368Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));23692370assert(su->cc == CC_NOT_P);2371red->setPredicate(su->cc, pred);2372mov->setPredicate(CC_P, pred);23732374bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),2375red->getDef(0), mov->getDef(0));23762377delete_Instruction(bld.getProgram(), su);2378handleCasExch(red, true);2379}23802381if (su->op == OP_SUSTB || su->op == OP_SUSTP)2382su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? 
TYPE_U32 : TYPE_U8;2383}23842385void2386NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)2387{2388const int slot = su->tex.r;2389const int dim = su->tex.target.getDim();2390const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());2391int c;2392Value *zero = bld.mkImm(0);2393Value *src[3];2394Value *v;2395Value *ind = su->getIndirectR();23962397bld.setPosition(su, false);23982399adjustCoordinatesMS(su);24002401if (ind) {2402Value *ptr;2403ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));2404ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));2405su->setIndirectR(ptr);2406}24072408// get surface coordinates2409for (c = 0; c < arg; ++c)2410src[c] = su->getSrc(c);2411for (; c < 3; ++c)2412src[c] = zero;24132414// calculate pixel offset2415if (su->op == OP_SULDP || su->op == OP_SUREDP) {2416v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);2417su->setSrc(0, (src[0] = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), src[0], v)));2418}24192420// add array layer offset2421if (su->tex.target.isArray() || su->tex.target.isCube()) {2422v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);2423assert(dim > 1);2424su->setSrc(2, (src[2] = bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v)));2425}24262427// 3d is special-cased. Note that a single "slice" of a 3d image may2428// also be attached as 2d, so we have to do the same 3d processing for2429// 2d as well, just in case. 
In order to remap a 3d image onto a 2d2430// image, we have to retile it "by hand".2431if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {2432Value *z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2433Value *y_size_aligned =2434bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),2435loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM_Y, su->tex.bindless),2436bld.loadImm(NULL, 0x0000ffff));2437// Add the z coordinate for actual 3d-images2438if (dim > 2)2439src[2] = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), z, src[2]);2440else2441src[2] = z;24422443// Compute the surface parameters from tile shifts2444Value *tile_shift[3];2445Value *tile_extbf[3];2446// Fetch the "real" tiling parameters of the underlying surface2447for (int i = 0; i < 3; i++) {2448tile_extbf[i] =2449bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2450loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),2451bld.loadImm(NULL, 16));2452tile_shift[i] =2453bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2454loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),2455bld.loadImm(NULL, 24));2456}24572458// However for load/atomics, we use byte-indexing. And for byte2459// indexing, the X tile size is always the same. This leads to slightly2460// better code.2461if (su->op == OP_SULDP || su->op == OP_SUREDP) {2462tile_extbf[0] = bld.loadImm(NULL, 0x600);2463tile_shift[0] = bld.loadImm(NULL, 6);2464}24652466// Compute the location of given coordinate, both inside the tile as2467// well as which (linearly-laid out) tile it's in.2468Value *coord_in_tile[3];2469Value *tile[3];2470for (int i = 0; i < 3; i++) {2471coord_in_tile[i] = bld.mkOp2v(OP_EXTBF, TYPE_U32, bld.getSSA(), src[i], tile_extbf[i]);2472tile[i] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), src[i], tile_shift[i]);2473}24742475// Based on the "real" tiling parameters, compute x/y coordinates in the2476// larger surface with 2d tiling that was supplied to the hardware. 
This2477// was determined and verified with the help of the tiling pseudocode in2478// the envytools docs.2479//2480// adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +2481// z_coord_in_tile * x_tile_size2482// adj_y = y_coord_in_tile + y_tile * y_tile_size +2483// z_tile * y_tile_size * y_tiles2484//2485// Note: STRIDE_Y = y_tile_size * y_tiles24862487su->setSrc(0, bld.mkOp2v(2488OP_ADD, TYPE_U32, bld.getSSA(),2489bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2490coord_in_tile[0],2491bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2492tile[0],2493bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2494tile_shift[2], tile_shift[0]))),2495bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2496coord_in_tile[2], tile_shift[0])));24972498su->setSrc(1, bld.mkOp2v(2499OP_ADD, TYPE_U32, bld.getSSA(),2500bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(),2501tile[2], y_size_aligned),2502bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2503coord_in_tile[1],2504bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2505tile[1], tile_shift[1]))));25062507if (su->tex.target == TEX_TARGET_3D) {2508su->moveSources(3, -1);2509su->tex.target = TEX_TARGET_2D;2510}2511}25122513// prevent read fault when the image is not actually bound2514CmpInstruction *pred =2515bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2516TYPE_U32, bld.mkImm(0),2517loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));2518if (su->op != OP_SUSTP && su->tex.format) {2519const TexInstruction::ImgFormatDesc *format = su->tex.format;2520int blockwidth = format->bits[0] + format->bits[1] +2521format->bits[2] + format->bits[3];25222523assert(format->components != 0);2524// make sure that the format doesn't mismatch when it's not FMT_NONE2525bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),2526TYPE_U32, bld.loadImm(NULL, ffs(blockwidth / 8) - 1),2527loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2528pred->getDef(0));2529}2530su->setPredicate(CC_NOT_P, 
pred->getDef(0));2531}25322533void2534NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)2535{2536if (su->tex.target == TEX_TARGET_1D_ARRAY) {2537/* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY2538* will simplify the lowering pass and the texture constraints. */2539su->moveSources(1, 1);2540su->setSrc(1, bld.loadImm(NULL, 0));2541su->tex.target = TEX_TARGET_2D_ARRAY;2542}25432544processSurfaceCoordsNVC0(su);25452546if (su->op == OP_SULDP) {2547convertSurfaceFormat(su, NULL);2548insertOOBSurfaceOpResult(su);2549}25502551if (su->op == OP_SUREDB || su->op == OP_SUREDP) {2552const int dim = su->tex.target.getDim();2553const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());2554LValue *addr = bld.getSSA(8);2555Value *def = su->getDef(0);25562557su->op = OP_SULEA;25582559// Set the destination to the address2560su->dType = TYPE_U64;2561su->setDef(0, addr);2562su->setDef(1, su->getPredicate());25632564bld.setPosition(su, true);25652566// Perform the atomic op2567Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());2568red->subOp = su->subOp;2569red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));2570red->setSrc(1, su->getSrc(arg));2571if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)2572red->setSrc(2, su->getSrc(arg + 1));2573red->setIndirect(0, 0, addr);25742575// make sure to initialize dst value when the atomic operation is not2576// performed2577Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));25782579assert(su->cc == CC_NOT_P);2580red->setPredicate(su->cc, su->getPredicate());2581mov->setPredicate(CC_P, su->getPredicate());25822583bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));25842585handleCasExch(red, false);2586}2587}25882589TexInstruction *2590NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])2591{2592const int slot = su->tex.r;2593const int dim = su->tex.target.getDim();2594const bool array = su->tex.target.isArray() || 
su->tex.target.isCube();2595const int arg = dim + array;2596Value *ind = su->getIndirectR();2597Value *handle;2598Instruction *pred = NULL, *pred2d = NULL;2599int pos = 0;26002601bld.setPosition(su, false);26022603adjustCoordinatesMS(su);26042605// add texture handle2606switch (su->op) {2607case OP_SUSTP:2608pos = 4;2609break;2610case OP_SUREDP:2611pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;2612break;2613default:2614assert(pos == 0);2615break;2616}26172618if (dim == 2 && !array) {2619// This might be a 2d slice of a 3d texture, try to load the z2620// coordinate in.2621Value *v;2622if (!su->tex.bindless)2623v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2624else2625v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));2626Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));2627pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2628TYPE_U32, bld.mkImm(0), is_3d);26292630bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));2631su->moveSources(dim, 1);2632su->setSrc(dim, v);2633su->tex.target = nv50_ir::TEX_TARGET_3D;2634pos++;2635}26362637if (su->tex.bindless)2638handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));2639else2640handle = loadTexHandle(ind, slot + 32);26412642su->setSrc(arg + pos, handle);26432644// The address check doesn't make sense here. 
The format check could make2645// sense but it's a bit of a pain.2646if (!su->tex.bindless) {2647// prevent read fault when the image is not actually bound2648pred =2649bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2650TYPE_U32, bld.mkImm(0),2651loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));2652if (su->op != OP_SUSTP && su->tex.format) {2653const TexInstruction::ImgFormatDesc *format = su->tex.format;2654int blockwidth = format->bits[0] + format->bits[1] +2655format->bits[2] + format->bits[3];26562657assert(format->components != 0);2658// make sure that the format doesn't mismatch when it's not FMT_NONE2659bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),2660TYPE_U32, bld.loadImm(NULL, blockwidth / 8),2661loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2662pred->getDef(0));2663}2664}26652666// Now we have "pred" which (optionally) contains whether to do the surface2667// op at all, and a "pred2d" which indicates that, in case of doing the2668// surface op, we have to create a 2d and 3d version, conditioned on pred2d.2669TexInstruction *su2d = NULL;2670if (pred2d) {2671su2d = cloneForward(func, su)->asTex();2672for (unsigned i = 0; su->defExists(i); ++i)2673su2d->setDef(i, bld.getSSA());2674su2d->moveSources(dim + 1, -1);2675su2d->tex.target = nv50_ir::TEX_TARGET_2D;2676}2677if (pred2d && pred) {2678Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,2679bld.getSSA(1, FILE_PREDICATE),2680pred->getDef(0), pred2d->getDef(0));2681pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);2682pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);2683su->setPredicate(CC_P, pred3d->getDef(0));2684pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),2685pred->getDef(0), pred2d->getDef(0));2686pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);2687} else if (pred) {2688su->setPredicate(CC_NOT_P, pred->getDef(0));2689} else if (pred2d) {2690su->setPredicate(CC_NOT_P, pred2d->getDef(0));2691}2692if (su2d) {2693su2d->setPredicate(CC_P, 
pred2d->getDef(0));2694bld.insert(su2d);26952696// Create a UNION so that RA assigns the same registers2697bld.setPosition(su, true);2698for (unsigned i = 0; su->defExists(i); ++i) {2699assert(i < 4);27002701Value *def = su->getDef(i);2702Value *newDef = bld.getSSA();2703ValueDef &def2 = su2d->def(i);2704Instruction *mov = NULL;27052706su->setDef(i, newDef);2707if (pred) {2708mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2709mov->setPredicate(CC_P, pred->getDef(0));2710}27112712Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,2713bld.getSSA(),2714newDef, def2.get());2715if (mov)2716uni->setSrc(2, mov->getDef(0));2717bld.mkMov(def, uni->getDef(0));2718}2719} else if (pred) {2720// Create a UNION so that RA assigns the same registers2721bld.setPosition(su, true);2722for (unsigned i = 0; su->defExists(i); ++i) {2723assert(i < 4);27242725Value *def = su->getDef(i);2726Value *newDef = bld.getSSA();2727su->setDef(i, newDef);27282729Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2730mov->setPredicate(CC_P, pred->getDef(0));27312732Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,2733bld.getSSA(),2734newDef, mov->getDef(0));2735bld.mkMov(def, uni->getDef(0));2736}2737}27382739return su2d;2740}27412742void2743NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)2744{2745// processSurfaceCoords also takes care of fixing up the outputs and2746// union'ing them with 0 as necessary. 
Additionally it may create a second2747// surface which needs some of the similar fixups.27482749Instruction *loaded[4] = {};2750TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);27512752if (su->op == OP_SULDP) {2753convertSurfaceFormat(su, loaded);2754}27552756if (su->op == OP_SUREDP) {2757su->op = OP_SUREDB;2758}27592760// If we fixed up the type of the regular surface load instruction, we also2761// have to fix up the copy.2762if (su2) {2763su2->op = su->op;2764su2->dType = su->dType;2765su2->sType = su->sType;2766}2767}27682769bool2770NVC0LoweringPass::handleWRSV(Instruction *i)2771{2772Instruction *st;2773Symbol *sym;2774uint32_t addr;27752776// must replace, $sreg are not writeable2777addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());2778if (addr >= 0x400)2779return false;2780sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);27812782st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),2783i->getSrc(1));2784st->perPatch = i->perPatch;27852786bld.getBB()->remove(i);2787return true;2788}27892790void2791NVC0LoweringPass::handleLDST(Instruction *i)2792{2793if (i->src(0).getFile() == FILE_SHADER_INPUT) {2794if (prog->getType() == Program::TYPE_COMPUTE) {2795i->getSrc(0)->reg.file = FILE_MEMORY_CONST;2796i->getSrc(0)->reg.fileIndex = 0;2797} else2798if (prog->getType() == Program::TYPE_GEOMETRY &&2799i->src(0).isIndirect(0)) {2800// XXX: this assumes vec4 units2801Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2802i->getIndirect(0, 0), bld.mkImm(4));2803i->setIndirect(0, 0, ptr);2804i->op = OP_VFETCH;2805} else {2806i->op = OP_VFETCH;2807assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP2808}2809} else if (i->src(0).getFile() == FILE_MEMORY_CONST) {2810int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;2811Value *ind = i->getIndirect(0, 1);28122813if (targ->getChipset() >= NVISA_GK104_CHIPSET &&2814prog->getType() == Program::TYPE_COMPUTE &&2815(fileIndex >= 6 || ind)) {2816// The launch descriptor 
only allows to set up 8 CBs, but OpenGL2817// requires at least 12 UBOs. To bypass this limitation, for constant2818// buffers 7+, we store the addrs into the driver constbuf and we2819// directly load from the global memory.2820if (ind) {2821// Clamp the UBO index when an indirect access is used to avoid2822// loading information from the wrong place in the driver cb.2823// TODO - synchronize the max with the driver.2824ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),2825bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2826ind, bld.loadImm(NULL, fileIndex)),2827bld.loadImm(NULL, 13));2828fileIndex = 0;2829}28302831Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));2832Value *ptr = loadUboInfo64(ind, fileIndex * 16);2833Value *length = loadUboLength32(ind, fileIndex * 16);2834Value *pred = new_LValue(func, FILE_PREDICATE);2835if (i->src(0).isIndirect(0)) {2836bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));2837bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));2838}2839i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;2840i->setIndirect(0, 1, NULL);2841i->setIndirect(0, 0, ptr);2842bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);2843i->setPredicate(CC_NOT_P, pred);2844Value *zero, *dst = i->getDef(0);2845i->setDef(0, bld.getSSA());28462847bld.setPosition(i, true);2848bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))2849->setPredicate(CC_P, pred);2850bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);2851} else if (i->src(0).isIndirect(1)) {2852Value *ptr;2853if (i->src(0).isIndirect(0))2854ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),2855i->getIndirect(0, 1), bld.mkImm(0x1010),2856i->getIndirect(0, 0));2857else2858ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2859i->getIndirect(0, 1), bld.mkImm(16));2860i->setIndirect(0, 1, NULL);2861i->setIndirect(0, 0, ptr);2862i->subOp = NV50_IR_SUBOP_LDC_IS;2863}2864} else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {2865assert(prog->getType() == 
Program::TYPE_TESSELLATION_CONTROL);2866i->op = OP_VFETCH;2867} else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {2868Value *ind = i->getIndirect(0, 1);2869Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);2870// XXX come up with a way not to do this for EVERY little access but2871// rather to batch these up somehow. Unfortunately we've lost the2872// information about the field width by the time we get here.2873Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));2874Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);2875Value *pred = new_LValue(func, FILE_PREDICATE);2876if (i->src(0).isIndirect(0)) {2877bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));2878bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));2879}2880i->setIndirect(0, 1, NULL);2881i->setIndirect(0, 0, ptr);2882i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;2883bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);2884i->setPredicate(CC_NOT_P, pred);2885if (i->defExists(0)) {2886Value *zero, *dst = i->getDef(0);2887i->setDef(0, bld.getSSA());28882889bld.setPosition(i, true);2890bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))2891->setPredicate(CC_P, pred);2892bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);2893}2894}2895}28962897void2898NVC0LoweringPass::readTessCoord(LValue *dst, int c)2899{2900Value *laneid = bld.getSSA();2901Value *x, *y;29022903bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));29042905if (c == 0) {2906x = dst;2907y = NULL;2908} else2909if (c == 1) {2910x = NULL;2911y = dst;2912} else {2913assert(c == 2);2914if (prog->driver_out->prop.tp.domain != PIPE_PRIM_TRIANGLES) {2915bld.mkMov(dst, bld.loadImm(NULL, 0));2916return;2917}2918x = bld.getSSA();2919y = bld.getSSA();2920}2921if (x)2922bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);2923if (y)2924bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);29252926if (c == 2) 
{2927bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);2928bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);2929}2930}29312932bool2933NVC0LoweringPass::handleRDSV(Instruction *i)2934{2935Symbol *sym = i->getSrc(0)->asSym();2936const SVSemantic sv = sym->reg.data.sv.sv;2937Value *vtx = NULL;2938Instruction *ld;2939uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);29402941if (addr >= 0x400) {2942// mov $sreg2943if (sym->reg.data.sv.index == 3) {2944// TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID2945i->op = OP_MOV;2946i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));2947} else2948if (sv == SV_TID) {2949// Help CSE combine TID fetches2950Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),2951bld.mkSysVal(SV_COMBINED_TID, 0));2952i->op = OP_EXTBF;2953i->setSrc(0, tid);2954switch (sym->reg.data.sv.index) {2955case 0: i->setSrc(1, bld.mkImm(0x1000)); break;2956case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;2957case 2: i->setSrc(1, bld.mkImm(0x061a)); break;2958}2959}2960if (sv == SV_VERTEX_COUNT) {2961bld.setPosition(i, true);2962bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));2963}2964return true;2965}29662967switch (sv) {2968case SV_POSITION:2969assert(prog->getType() == Program::TYPE_FRAGMENT);2970if (i->srcExists(1)) {2971// Pass offset through to the interpolation logic2972ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,2973i->getDef(0), addr, NULL);2974ld->setSrc(1, i->getSrc(1));2975} else {2976bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);2977}2978break;2979case SV_FACE:2980{2981Value *face = i->getDef(0);2982bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);2983if (i->dType == TYPE_F32) {2984bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));2985bld.mkOp1(OP_NEG, TYPE_S32, face, face);2986bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);2987}2988}2989break;2990case SV_TESS_COORD:2991assert(prog->getType() == 
Program::TYPE_TESSELLATION_EVAL);2992readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);2993break;2994case SV_NTID:2995case SV_NCTAID:2996case SV_GRIDID:2997assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise2998if (sym->reg.data.sv.index == 3) {2999i->op = OP_MOV;3000i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));3001return true;3002}3003FALLTHROUGH;3004case SV_WORK_DIM:3005addr += prog->driver->prop.cp.gridInfoBase;3006bld.mkLoad(TYPE_U32, i->getDef(0),3007bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3008TYPE_U32, addr), NULL);3009break;3010case SV_SAMPLE_INDEX:3011// TODO: Properly pass source as an address in the PIX address space3012// (which can be of the form [r0+offset]). But this is currently3013// unnecessary.3014ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));3015ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3016break;3017case SV_SAMPLE_POS: {3018Value *sampleID = bld.getScratch();3019ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));3020ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3021Value *offset = calculateSampleOffset(sampleID);30223023assert(prog->driver_out->prop.fp.readsSampleLocations);30243025if (targ->getChipset() >= NVISA_GM200_CHIPSET) {3026bld.mkLoad(TYPE_F32,3027i->getDef(0),3028bld.mkSymbol(3029FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3030TYPE_U32, prog->driver->io.sampleInfoBase),3031offset);3032bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),3033bld.mkImm(0x040c + sym->reg.data.sv.index * 16));3034bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));3035bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));3036} else {3037bld.mkLoad(TYPE_F32,3038i->getDef(0),3039bld.mkSymbol(3040FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3041TYPE_U32, prog->driver->io.sampleInfoBase +30424 * sym->reg.data.sv.index),3043offset);3044}3045break;3046}3047case SV_SAMPLE_MASK: {3048ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), 
bld.mkImm(0));3049ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;3050Instruction *sampleid =3051bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));3052sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3053Value *masked =3054bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),3055bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),3056bld.loadImm(NULL, 1), sampleid->getDef(0)));3057if (prog->persampleInvocation) {3058bld.mkMov(i->getDef(0), masked);3059} else {3060bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,3061bld.mkImm(0))3062->subOp = 1;3063}3064break;3065}3066case SV_BASEVERTEX:3067case SV_BASEINSTANCE:3068case SV_DRAWID:3069ld = bld.mkLoad(TYPE_U32, i->getDef(0),3070bld.mkSymbol(FILE_MEMORY_CONST,3071prog->driver->io.auxCBSlot,3072TYPE_U32,3073prog->driver->io.drawInfoBase +30744 * (sv - SV_BASEVERTEX)),3075NULL);3076break;3077default:3078if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)3079vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));3080if (prog->getType() == Program::TYPE_FRAGMENT) {3081bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);3082} else {3083ld = bld.mkFetch(i->getDef(0), i->dType,3084FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);3085ld->perPatch = i->perPatch;3086}3087break;3088}3089bld.getBB()->remove(i);3090return true;3091}30923093bool3094NVC0LoweringPass::handleDIV(Instruction *i)3095{3096if (!isFloatType(i->dType))3097return true;3098bld.setPosition(i, false);3099Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));3100i->op = OP_MUL;3101i->setSrc(1, rcp->getDef(0));3102return true;3103}31043105bool3106NVC0LoweringPass::handleMOD(Instruction *i)3107{3108if (!isFloatType(i->dType))3109return true;3110LValue *value = bld.getScratch(typeSizeof(i->dType));3111bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));3112bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);3113bld.mkOp1(OP_TRUNC, i->dType, value, value);3114bld.mkOp2(OP_MUL, 
i->dType, value, i->getSrc(1), value);3115i->op = OP_SUB;3116i->setSrc(1, value);3117return true;3118}31193120bool3121NVC0LoweringPass::handleSQRT(Instruction *i)3122{3123if (targ->isOpSupported(OP_SQRT, i->dType))3124return true;31253126if (i->dType == TYPE_F64) {3127Value *pred = bld.getSSA(1, FILE_PREDICATE);3128Value *zero = bld.loadImm(NULL, 0.0);3129Value *dst = bld.getSSA(8);3130bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));3131bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);3132bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);3133i->op = OP_MUL;3134i->setSrc(1, dst);3135// TODO: Handle this properly with a library function3136} else {3137bld.setPosition(i, true);3138i->op = OP_RSQ;3139bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));3140}31413142return true;3143}31443145bool3146NVC0LoweringPass::handlePOW(Instruction *i)3147{3148LValue *val = bld.getScratch();31493150bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));3151bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;3152bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);31533154i->op = OP_EX2;3155i->setSrc(0, val);3156i->setSrc(1, NULL);31573158return true;3159}31603161bool3162NVC0LoweringPass::handleEXPORT(Instruction *i)3163{3164if (prog->getType() == Program::TYPE_FRAGMENT) {3165int id = i->getSrc(0)->reg.data.offset / 4;31663167if (i->src(0).isIndirect(0)) // TODO, ugly3168return false;3169i->op = OP_MOV;3170i->subOp = NV50_IR_SUBOP_MOV_FINAL;3171i->src(0).set(i->src(1));3172i->setSrc(1, NULL);3173i->setDef(0, new_LValue(func, FILE_GPR));3174i->getDef(0)->reg.data.id = id;31753176prog->maxGPR = MAX2(prog->maxGPR, id);3177} else3178if (prog->getType() == Program::TYPE_GEOMETRY) {3179i->setIndirect(0, 1, gpEmitAddress);3180}3181return true;3182}31833184bool3185NVC0LoweringPass::handleOUT(Instruction *i)3186{3187Instruction *prev = i->prev;3188ImmediateValue stream, prevStream;31893190// Only merge if the stream ids match. 
// Also, note that the previous
   // instruction would have already been lowered, so we take arg1 from it.
   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
       i->src(0).getImmediate(stream) &&
       prev->src(1).getImmediate(prevStream) &&
       stream.reg.data.u32 == prevStream.reg.data.u32) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      // Rewrite: def/src0 = emit address, src1 = stream id.
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Compute the byte offset of the given sample's entry in the driver's
// sample-location table (read from the aux constant buffer by
// handlePIXLD).
Value *
NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
{
   Value *offset = bld.getScratch();
   if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
      // Sample location offsets (in bytes) are calculated like so:
      // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
      // offset = offset * 32 + sampleID % 8 * 4;
      // which is equivalent to:
      // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5;
      // offset += sampleID << 2

      // The second operand (src1) of the INSBF instructions are like so:
      // 0xssll where ss is the size and ll is the offset.
      // so: dest = src2 | (src0 & (1 << ss - 1)) << ll

      // Add sample ID (offset = (sampleID & 0x7) << 2)
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));

      Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
      Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
      Value *coord = bld.getScratch();

      // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
                   targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
      // Truncate the interpolated position to an integer pixel coord.
      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
         ->rnd = ROUND_ZI;
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);

      // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
                   targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
      bld.mkCvt(OP_CVT, TYPE_U32,
coord, TYPE_F32, coord)
         ->rnd = ROUND_ZI;
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
   } else {
      // Pre-GM200: table is simply indexed by sample id, 8 bytes apart.
      bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
   }
   return offset;
}

// Handle programmable sample locations for GM20x+
void
NVC0LoweringPass::handlePIXLD(Instruction *i)
{
   // Only the OFFSET subop needs lowering, and only on GM200+ where
   // sample locations are programmable.
   if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
      return;
   if (targ->getChipset() < NVISA_GM200_CHIPSET)
      return;

   assert(prog->driver_out->prop.fp.readsSampleLocations);

   // Replace PIXLD with a load of the sample position from the table
   // the driver uploads into the aux constant buffer.
   bld.mkLoad(TYPE_F32,
              i->getDef(0),
              bld.mkSymbol(
                    FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                    TYPE_U32, prog->driver->io.sampleInfoBase),
              calculateSampleOffset(i->getSrc(0)));

   bld.getBB()->remove(i);
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   // Nothing to do if there is no predicate or it is already a real one.
   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   // pdst = (pred != 0), compared in the instruction's own type.
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   // Dispatch to the per-opcode lowering helpers.
   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      // EX2 needs its source preprocessed by PREEX2.
      bld.mkOp1(OP_PREEX2, TYPE_F32,
i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      // Note whether the atomic targets a buffer BEFORE handleATOM runs,
      // since lowering may rewrite the source file/symbol -- the flag is
      // then consumed by handleCasExch.
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      // Surface ops have three generations of lowering paths.
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   case OP_PIXLD:
      handlePIXLD(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
    */
   bool doAfetch = false;
   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
       !i->perPatch &&
       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }
   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }

   if (doAfetch) {
      // Split the access: AFETCH computes the indirect base, and the
      // original op keeps only the (now zero-offset) symbol plus the
      // AFETCH result as its indirect address.
      Value *addr = cloneShallow(func, i->getSrc(0));
      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
                                      i->getSrc(0));
      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
      addr->reg.data.offset = 0;
      i->setSrc(0,
addr);
      i->setIndirect(0, 0, afetch->getDef(0));
   }

   return ret;
}

// Target entry point: run the lowering/legalization pass matching the
// given codegen stage (PRE_SSA -> lowering, SSA -> legalize SSA,
// POST_RA -> post-RA cleanup).
bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir