Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
4574 views
/*1* Copyright 2011 Christoph Bumiller2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*/2122#include "codegen/nv50_ir.h"23#include "codegen/nv50_ir_build_util.h"2425#include "codegen/nv50_ir_target_nv50.h"2627#define NV50_SU_INFO_SIZE_X 0x0028#define NV50_SU_INFO_SIZE_Y 0x0429#define NV50_SU_INFO_SIZE_Z 0x0830#define NV50_SU_INFO_BSIZE 0x0c31#define NV50_SU_INFO_STRIDE_Y 0x1032#define NV50_SU_INFO_MS_X 0x1833#define NV50_SU_INFO_MS_Y 0x1c34#define NV50_SU_INFO_TILE_SHIFT_X 0x2035#define NV50_SU_INFO_TILE_SHIFT_Y 0x2436#define NV50_SU_INFO_TILE_SHIFT_Z 0x2837#define NV50_SU_INFO_OFFSET_Z 0x2c3839#define NV50_SU_INFO__STRIDE 0x304041#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)42#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4)43#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)4445namespace nv50_ir {4647// nv50 doesn't support 32 bit integer multiplication48//49// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)50// -------------------51// al*bh 00 HI32: (al 
* bh + ah * bl) >> 16 + (ah * bh) +52// ah*bh 00 00 ( carry1) << 16 + ( carry2)53// al*bl54// ah*bl 0055//56// fffe0001 + fffe000157//58// Note that this sort of splitting doesn't work for signed values, so we59// compute the sign on those manually and then perform an unsigned multiply.60static bool61expandIntegerMUL(BuildUtil *bld, Instruction *mul)62{63const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;64ImmediateValue src1;65bool src1imm = mul->src(1).getImmediate(src1);6667DataType fTy; // full type68switch (mul->sType) {69case TYPE_S32: fTy = TYPE_U32; break;70case TYPE_S64: fTy = TYPE_U64; break;71default: fTy = mul->sType; break;72}7374DataType hTy; // half type75switch (fTy) {76case TYPE_U32: hTy = TYPE_U16; break;77case TYPE_U64: hTy = TYPE_U32; break;78default:79return false;80}81unsigned int fullSize = typeSizeof(fTy);82unsigned int halfSize = typeSizeof(hTy);8384Instruction *i[9];8586bld->setPosition(mul, true);8788Value *s[2];89Value *a[2], *b[2];90Value *t[4];91for (int j = 0; j < 4; ++j)92t[j] = bld->getSSA(fullSize);9394if (isSignedType(mul->sType) && highResult) {95s[0] = bld->getSSA(fullSize);96s[1] = bld->getSSA(fullSize);97bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));98bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));99src1.reg.data.s32 = abs(src1.reg.data.s32);100} else {101s[0] = mul->getSrc(0);102s[1] = mul->getSrc(1);103}104105// split sources into halves106i[0] = bld->mkSplit(a, halfSize, s[0]);107i[1] = bld->mkSplit(b, halfSize, s[1]);108109if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {110i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],111bld->mkImm(src1.reg.data.u32 & 0xffff));112} else {113i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],114src1imm ? 
bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);115if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {116i[3] = i[2];117t[1] = t[0];118} else {119i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);120}121}122i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));123if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {124i[4] = i[3];125t[3] = t[2];126} else {127i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);128}129130if (highResult) {131Value *c[2];132Value *r[5];133Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));134c[0] = bld->getSSA(1, FILE_FLAGS);135c[1] = bld->getSSA(1, FILE_FLAGS);136for (int j = 0; j < 5; ++j)137r[j] = bld->getSSA(fullSize);138139i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));140i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);141bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);142bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);143i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);144145// set carry defs / sources146i[3]->setFlagsDef(1, c[0]);147// actual result required in negative case, but ignored for148// unsigned. for some reason the compiler ends up dropping the whole149// instruction if the destination is unused but the flags are.150if (isSignedType(mul->sType))151i[4]->setFlagsDef(1, c[1]);152else153i[4]->setFlagsDef(0, c[1]);154i[6]->setPredicate(CC_C, c[0]);155i[5]->setFlagsSrc(3, c[1]);156157if (isSignedType(mul->sType)) {158Value *cc[2];159Value *rr[7];160Value *one = bld->getSSA(fullSize);161bld->loadImm(one, 1);162for (int j = 0; j < 7; j++)163rr[j] = bld->getSSA(fullSize);164165// NOTE: this logic uses predicates because splitting basic blocks is166// ~impossible during the SSA phase. 
The RA relies on a correlation167// between edge order and phi node sources.168169// Set the sign of the result based on the inputs170bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))171->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));172173// 1s complement of 64-bit value174bld->mkOp1(OP_NOT, fTy, rr[0], r[4])175->setPredicate(CC_S, cc[0]);176bld->mkOp1(OP_NOT, fTy, rr[1], t[3])177->setPredicate(CC_S, cc[0]);178179// add to low 32-bits, keep track of the carry180Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);181n->setPredicate(CC_S, cc[0]);182n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));183184// If there was a carry, add 1 to the upper 32 bits185// XXX: These get executed even if they shouldn't be186bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)187->setPredicate(CC_C, cc[1]);188bld->mkMov(rr[3], rr[0])189->setPredicate(CC_NC, cc[1]);190bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);191192// Merge the results from the negative and non-negative paths193bld->mkMov(rr[5], rr[4])194->setPredicate(CC_S, cc[0]);195bld->mkMov(rr[6], r[4])196->setPredicate(CC_NS, cc[0]);197bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);198} else {199bld->mkMov(mul->getDef(0), r[4]);200}201} else {202bld->mkMov(mul->getDef(0), t[3]);203}204delete_Instruction(bld->getProgram(), mul);205206for (int j = 2; j <= (highResult ? 
5 : 4); ++j)207if (i[j])208i[j]->sType = hTy;209210return true;211}212213#define QOP_ADD 0214#define QOP_SUBR 1215#define QOP_SUB 2216#define QOP_MOV2 3217218// UL UR LL LR219#define QUADOP(q, r, s, t) \220((QOP_##q << 6) | (QOP_##r << 4) | \221(QOP_##s << 2) | (QOP_##t << 0))222223class NV50LegalizePostRA : public Pass224{225public:226NV50LegalizePostRA() : r63(NULL) { }227228private:229virtual bool visit(Function *);230virtual bool visit(BasicBlock *);231232void handlePRERET(FlowInstruction *);233void replaceZero(Instruction *);234235BuildUtil bld;236237LValue *r63;238};239240bool241NV50LegalizePostRA::visit(Function *fn)242{243Program *prog = fn->getProgram();244245r63 = new_LValue(fn, FILE_GPR);246// GPR units on nv50 are in half-regs247if (prog->maxGPR < 126)248r63->reg.data.id = 63;249else250r63->reg.data.id = 127;251252// this is actually per-program, but we can do it all on visiting main()253std::list<Instruction *> *outWrites =254reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);255256if (outWrites) {257for (std::list<Instruction *>::iterator it = outWrites->begin();258it != outWrites->end(); ++it)259(*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));260// instructions will be deleted on exit261outWrites->clear();262}263264return true;265}266267void268NV50LegalizePostRA::replaceZero(Instruction *i)269{270for (int s = 0; i->srcExists(s); ++s) {271ImmediateValue *imm = i->getSrc(s)->asImm();272if (imm && imm->reg.data.u64 == 0)273i->setSrc(s, r63);274}275}276277// Emulate PRERET: jump to the target and call to the origin from there278//279// WARNING: atm only works if BBs are affected by at most a single PRERET280//281// BB:0282// preret BB:3283// (...)284// BB:3285// (...)286// --->287// BB:0288// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)289// (...)290// BB:3291// bra BB:3 + n1 (skip the call)292// call BB:0 + n2 (skip bra at beginning of BB:0)293// 
(...)294void295NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)296{297BasicBlock *bbE = pre->bb;298BasicBlock *bbT = pre->target.bb;299300pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;301bbE->remove(pre);302bbE->insertHead(pre);303304Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);305Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);306307bbT->insertHead(call);308bbT->insertHead(skip);309310// NOTE: maybe split blocks to prevent the instructions from moving ?311312skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;313call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;314}315316bool317NV50LegalizePostRA::visit(BasicBlock *bb)318{319Instruction *i, *next;320321// remove pseudo operations and non-fixed no-ops, split 64 bit operations322for (i = bb->getFirst(); i; i = next) {323next = i->next;324if (i->isNop()) {325bb->remove(i);326} else327if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {328handlePRERET(i->asFlow());329} else {330// TODO: We will want to do this before register allocation,331// since have to use a $c register for the carry flag.332if (typeSizeof(i->dType) == 8) {333Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);334if (hi)335next = hi;336}337338if (i->op != OP_PFETCH && i->op != OP_BAR &&339(!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))340replaceZero(i);341}342}343if (!bb->getEntry())344return true;345346return true;347}348349class NV50LegalizeSSA : public Pass350{351public:352NV50LegalizeSSA(Program *);353354virtual bool visit(BasicBlock *bb);355356private:357void propagateWriteToOutput(Instruction *);358void handleDIV(Instruction *);359void handleMOD(Instruction *);360void handleMUL(Instruction *);361void handleAddrDef(Instruction *);362363inline bool isARL(const Instruction *) const;364365BuildUtil bld;366367std::list<Instruction *> *outWrites;368};369370NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)371{372bld.setProgram(prog);373374if (prog->optLevel >= 2 &&375(prog->getType() == 
Program::TYPE_GEOMETRY ||376prog->getType() == Program::TYPE_VERTEX))377outWrites =378reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);379else380outWrites = NULL;381}382383void384NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)385{386if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)387return;388389// check def instruction can store390Instruction *di = st->getSrc(1)->defs.front()->getInsn();391392// TODO: move exports (if beneficial) in common opt pass393if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)394return;395396for (int s = 0; di->srcExists(s); ++s)397if (di->src(s).getFile() == FILE_IMMEDIATE ||398di->src(s).getFile() == FILE_MEMORY_LOCAL)399return;400401if (prog->getType() == Program::TYPE_GEOMETRY) {402// Only propagate output writes in geometry shaders when we can be sure403// that we are propagating to the same output vertex.404if (di->bb != st->bb)405return;406Instruction *i;407for (i = di; i != st; i = i->next) {408if (i->op == OP_EMIT || i->op == OP_RESTART)409return;410}411assert(i); // st after di412}413414// We cannot set defs to non-lvalues before register allocation, so415// save & remove (to save registers) the exports and replace later.416outWrites->push_back(st);417st->bb->remove(st);418}419420bool421NV50LegalizeSSA::isARL(const Instruction *i) const422{423ImmediateValue imm;424425if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)426return false;427if (!i->src(1).getImmediate(imm))428return false;429return imm.isInteger(0);430}431432void433NV50LegalizeSSA::handleAddrDef(Instruction *i)434{435Instruction *arl;436437i->getDef(0)->reg.size = 2; // $aX are only 16 bit438439// PFETCH can always write to $a440if (i->op == OP_PFETCH)441return;442// only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid443if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {444if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)445return;446if (i->op == OP_ADD && 
i->src(0).getFile() == FILE_ADDRESS)447return;448}449450// turn $a sources into $r sources (can't operate on $a)451for (int s = 0; i->srcExists(s); ++s) {452Value *a = i->getSrc(s);453Value *r;454if (a->reg.file == FILE_ADDRESS) {455if (a->getInsn() && isARL(a->getInsn())) {456i->setSrc(s, a->getInsn()->getSrc(0));457} else {458bld.setPosition(i, false);459r = bld.getSSA();460bld.mkMov(r, a);461i->setSrc(s, r);462}463}464}465if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)466return;467468// turn result back into $a469bld.setPosition(i, true);470arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));471i->setDef(0, arl->getSrc(0));472}473474void475NV50LegalizeSSA::handleMUL(Instruction *mul)476{477if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)478return;479Value *def = mul->getDef(0);480Value *pred = mul->getPredicate();481CondCode cc = mul->cc;482if (pred)483mul->setPredicate(CC_ALWAYS, NULL);484485if (mul->op == OP_MAD) {486Instruction *add = mul;487bld.setPosition(add, false);488Value *res = cloneShallow(func, mul->getDef(0));489mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));490add->op = OP_ADD;491add->setSrc(0, mul->getDef(0));492add->setSrc(1, add->getSrc(2));493for (int s = 2; add->srcExists(s); ++s)494add->setSrc(s, NULL);495mul->subOp = add->subOp;496add->subOp = 0;497}498expandIntegerMUL(&bld, mul);499if (pred)500def->getInsn()->setPredicate(cc, pred);501}502503// Use f32 division: first compute an approximate result, use it to reduce504// the dividend, which should then be representable as f32, divide the reduced505// dividend, and add the quotients.506void507NV50LegalizeSSA::handleDIV(Instruction *div)508{509const DataType ty = div->sType;510511if (ty != TYPE_U32 && ty != TYPE_S32)512return;513514Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;515516bld.setPosition(div, false);517518Value *a, *af = bld.getSSA();519Value *b, *bf = bld.getSSA();520521bld.mkCvt(OP_CVT, 
TYPE_F32, af, ty, div->getSrc(0));522bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));523524if (isSignedType(ty)) {525af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);526bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);527a = bld.getSSA();528b = bld.getSSA();529bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));530bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));531} else {532a = div->getSrc(0);533b = div->getSrc(1);534}535536bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);537bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));538539bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;540bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;541542// get error of 1st result543expandIntegerMUL(&bld,544bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));545bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);546547bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);548549bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;550bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)551->rnd = ROUND_Z;552bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients553554// correction: if modulus >= divisor, add 1555expandIntegerMUL(&bld,556bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));557bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);558bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);559if (!isSignedType(ty)) {560div->op = OP_SUB;561div->setSrc(0, q);562div->setSrc(1, s);563} else {564t = q;565bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);566s = bld.getSSA();567t = bld.getSSA();568// fix the sign569bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))570->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));571bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);572bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);573574div->op = OP_UNION;575div->setSrc(0, s);576div->setSrc(1, 
t);577}578}579580void581NV50LegalizeSSA::handleMOD(Instruction *mod)582{583if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)584return;585bld.setPosition(mod, false);586587Value *q = bld.getSSA();588Value *m = bld.getSSA();589590bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));591handleDIV(q->getInsn());592593bld.setPosition(mod, false);594expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));595596mod->op = OP_SUB;597mod->setSrc(1, m);598}599600bool601NV50LegalizeSSA::visit(BasicBlock *bb)602{603Instruction *insn, *next;604// skipping PHIs (don't pass them to handleAddrDef) !605for (insn = bb->getEntry(); insn; insn = next) {606next = insn->next;607608if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)609handleAddrDef(insn);610611switch (insn->op) {612case OP_EXPORT:613if (outWrites)614propagateWriteToOutput(insn);615break;616case OP_DIV:617handleDIV(insn);618break;619case OP_MOD:620handleMOD(insn);621break;622case OP_MAD:623case OP_MUL:624handleMUL(insn);625break;626default:627break;628}629}630return true;631}632633class NV50LoweringPreSSA : public Pass634{635public:636NV50LoweringPreSSA(Program *);637638private:639virtual bool visit(Instruction *);640virtual bool visit(Function *);641642bool handleRDSV(Instruction *);643bool handleWRSV(Instruction *);644645bool handlePFETCH(Instruction *);646bool handleEXPORT(Instruction *);647bool handleLOAD(Instruction *);648bool handleLDST(Instruction *);649bool handleMEMBAR(Instruction *);650bool handleSharedATOM(Instruction *);651bool handleSULDP(TexInstruction *);652bool handleSUREDP(TexInstruction *);653bool handleSUSTP(TexInstruction *);654Value *processSurfaceCoords(TexInstruction *);655656bool handleDIV(Instruction *);657bool handleSQRT(Instruction *);658bool handlePOW(Instruction *);659660bool handleSET(Instruction *);661bool handleSLCT(CmpInstruction *);662bool handleSELP(Instruction *);663664bool handleTEX(TexInstruction *);665bool handleTXB(TexInstruction *); 
// I really666bool handleTXL(TexInstruction *); // hate667bool handleTXD(TexInstruction *); // these 3668bool handleTXLQ(TexInstruction *);669bool handleTXQ(TexInstruction *);670bool handleSUQ(TexInstruction *);671bool handleBUFQ(Instruction *);672673bool handleCALL(Instruction *);674bool handlePRECONT(Instruction *);675bool handleCONT(Instruction *);676677void checkPredicate(Instruction *);678void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);679void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);680Value *loadSuInfo(int slot, uint32_t off);681Value *loadSuInfo16(int slot, uint32_t off);682683private:684const Target *const targ;685686BuildUtil bld;687688Value *tid;689};690691NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :692targ(prog->getTarget()), tid(NULL)693{694bld.setProgram(prog);695}696697bool698NV50LoweringPreSSA::visit(Function *f)699{700BasicBlock *root = BasicBlock::get(func->cfg.getRoot());701702if (prog->getType() == Program::TYPE_COMPUTE) {703// Add implicit "thread id" argument in $r0 to the function704Value *arg = new_LValue(func, FILE_GPR);705arg->reg.data.id = 0;706f->ins.push_back(arg);707708bld.setPosition(root, false);709tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);710}711712return true;713}714715void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,716Value **ms_x, Value **ms_y) {717// This loads the texture-indexed ms setting from the constant buffer718Value *tmp = new_LValue(func, FILE_GPR);719uint8_t b = prog->driver->io.auxCBSlot;720off += prog->driver->io.suInfoBase;721if (prog->getType() > Program::TYPE_VERTEX)722off += 16 * 2 * 4;723if (prog->getType() > Program::TYPE_GEOMETRY)724off += 16 * 2 * 4;725if (prog->getType() > Program::TYPE_FRAGMENT)726off += 16 * 2 * 4;727*ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(728FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);729*ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(730FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);731*ms = 
bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);732}733734void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {735// Given a MS level, and a sample id, compute the delta x/y736uint8_t b = prog->driver->io.msInfoCBSlot;737Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);738739// The required information is at mslevel * 16 * 4 + sample * 8740// = (mslevel * 8 + sample) * 8741bld.mkOp2(OP_SHL,742TYPE_U32,743off,744bld.mkOp2v(OP_ADD, TYPE_U32, t,745bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),746s),747bld.mkImm(3));748*dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(749FILE_MEMORY_CONST, b, TYPE_U32,750prog->driver->io.msInfoBase), off);751*dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(752FILE_MEMORY_CONST, b, TYPE_U32,753prog->driver->io.msInfoBase + 4), off);754}755756Value *757NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)758{759uint8_t b = prog->driver->io.auxCBSlot;760off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;761return bld.mkLoadv(TYPE_U32, bld.mkSymbol(762FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);763}764765Value *766NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)767{768uint8_t b = prog->driver->io.auxCBSlot;769off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;770return bld.mkLoadv(TYPE_U16, bld.mkSymbol(771FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);772}773774bool775NV50LoweringPreSSA::handleTEX(TexInstruction *i)776{777const int arg = i->tex.target.getArgCount();778const int dref = arg;779const int lod = i->tex.target.isShadow() ? 
(arg + 1) : arg;780781/* Only normalize in the non-explicit derivatives case.782*/783if (i->tex.target.isCube() && i->op != OP_TXD) {784Value *src[3], *val;785int c;786for (c = 0; c < 3; ++c)787src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));788val = bld.getScratch();789bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);790bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);791bld.mkOp1(OP_RCP, TYPE_F32, val, val);792for (c = 0; c < 3; ++c) {793i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),794i->getSrc(c), val));795}796}797798// handle MS, which means looking up the MS params for this texture, and799// adjusting the input coordinates to point at the right sample.800if (i->tex.target.isMS()) {801Value *x = i->getSrc(0);802Value *y = i->getSrc(1);803Value *s = i->getSrc(arg - 1);804Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),805*ms, *ms_x, *ms_y, *dx, *dy;806807i->tex.target.clearMS();808809loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);810loadMsInfo(ms, s, &dx, &dy);811812bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);813bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);814bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);815bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);816i->setSrc(0, tx);817i->setSrc(1, ty);818i->setSrc(arg - 1, bld.loadImm(NULL, 0));819}820821// dref comes before bias/lod822if (i->tex.target.isShadow())823if (i->op == OP_TXB || i->op == OP_TXL)824i->swapSources(dref, lod);825826if (i->tex.target.isArray()) {827if (i->op != OP_TXF) {828// array index must be converted to u32, but it's already an integer829// for TXF830Value *layer = i->getSrc(arg - 1);831LValue *src = new_LValue(func, FILE_GPR);832bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);833bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));834i->setSrc(arg - 1, src);835}836if (i->tex.target.isCube() && i->srcCount() > 4) {837std::vector<Value *> acube, a2d;838int c;839840acube.resize(4);841for (c = 0; c < 4; ++c)842acube[c] = 
i->getSrc(c);843a2d.resize(4);844for (c = 0; c < 3; ++c)845a2d[c] = new_LValue(func, FILE_GPR);846a2d[3] = NULL;847848bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,849a2d, acube)->asTex()->tex.mask = 0x7;850851for (c = 0; c < 3; ++c)852i->setSrc(c, a2d[c]);853for (; i->srcExists(c + 1); ++c)854i->setSrc(c, i->getSrc(c + 1));855i->setSrc(c, NULL);856assert(c <= 4);857858i->tex.target = i->tex.target.isShadow() ?859TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;860}861}862863// texel offsets are 3 immediate fields in the instruction,864// nv50 cannot do textureGatherOffsets865assert(i->tex.useOffsets <= 1);866if (i->tex.useOffsets) {867for (int c = 0; c < 3; ++c) {868ImmediateValue val;869if (!i->offset[0][c].getImmediate(val))870assert(!"non-immediate offset");871i->tex.offset[c] = val.reg.data.u32;872i->offset[0][c].set(NULL);873}874}875876return true;877}878879// Bias must be equal for all threads of a quad or lod calculation will fail.880//881// The lanes of a quad are grouped by the bit in the condition register they882// have set, which is selected by differing bias values.883// Move the input values for TEX into a new register set for each group and884// execute TEX only for a specific group.885// We always need to use 4 new registers for the inputs/outputs because the886// implicitly calculated derivatives must be correct.887//888// TODO: move to SSA phase so we can easily determine whether bias is constant889bool890NV50LoweringPreSSA::handleTXB(TexInstruction *i)891{892const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };893int l, d;894895// We can't actually apply bias *and* do a compare for a cube896// texture. 
Since the compare has to be done before the filtering, just897// drop the bias on the floor.898if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {899i->op = OP_TEX;900i->setSrc(3, i->getSrc(4));901i->setSrc(4, NULL);902return handleTEX(i);903}904905handleTEX(i);906Value *bias = i->getSrc(i->tex.target.getArgCount());907if (bias->isUniform())908return true;909910Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),911bld.loadImm(NULL, 1));912bld.setPosition(cond, false);913914for (l = 1; l < 4; ++l) {915const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);916Value *bit = bld.getSSA();917Value *pred = bld.getScratch(1, FILE_FLAGS);918Value *imm = bld.loadImm(NULL, (1 << l));919bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;920bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);921cond->setSrc(l, bit);922}923Value *flags = bld.getScratch(1, FILE_FLAGS);924bld.setPosition(cond, true);925bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;926927Instruction *tex[4];928for (l = 0; l < 4; ++l) {929(tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);930bld.insert(tex[l]);931}932933Value *res[4][4];934for (d = 0; i->defExists(d); ++d)935res[0][d] = tex[0]->getDef(d);936for (l = 1; l < 4; ++l) {937for (d = 0; tex[l]->defExists(d); ++d) {938res[l][d] = cloneShallow(func, res[0][d]);939bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);940}941}942943for (d = 0; i->defExists(d); ++d) {944Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));945for (l = 0; l < 4; ++l)946dst->setSrc(l, res[l][d]);947}948delete_Instruction(prog, i);949return true;950}951952// LOD must be equal for all threads of a quad.953// Unlike with TXB, here we can just diverge since there's no LOD calculation954// that would require all 4 threads' sources to be set up properly.955bool956NV50LoweringPreSSA::handleTXL(TexInstruction *i)957{958handleTEX(i);959Value *lod = i->getSrc(i->tex.target.getArgCount());960if (lod->isUniform())961return 
true;962963BasicBlock *currBB = i->bb;964BasicBlock *texiBB = i->bb->splitBefore(i, false);965BasicBlock *joinBB = i->bb->splitAfter(i);966967bld.setPosition(currBB, true);968assert(!currBB->joinAt);969currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);970971for (int l = 0; l <= 3; ++l) {972const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);973Value *pred = bld.getScratch(1, FILE_FLAGS);974bld.setPosition(currBB, true);975bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;976bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;977currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);978if (l <= 2) {979BasicBlock *laneBB = new BasicBlock(func);980currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);981currBB = laneBB;982}983}984bld.setPosition(joinBB, false);985bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;986return true;987}988989bool990NV50LoweringPreSSA::handleTXD(TexInstruction *i)991{992static const uint8_t qOps[4][2] =993{994{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0995{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1996{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2997{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3998};999Value *def[4][4];1000Value *crd[3];1001Instruction *tex;1002Value *zero = bld.loadImm(bld.getSSA(), 0);1003int l, c;1004const int dim = i->tex.target.getDim() + i->tex.target.isCube();10051006handleTEX(i);1007i->op = OP_TEX; // no need to clone dPdx/dPdy later1008i->tex.derivAll = true;10091010for (c = 0; c < dim; ++c)1011crd[c] = bld.getScratch();10121013bld.mkOp(OP_QUADON, TYPE_NONE, NULL);1014for (l = 0; l < 4; ++l) {1015Value *src[3], *val;1016// mov coordinates from lane l to all lanes1017for (c = 0; c < dim; ++c)1018bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);1019// add dPdx from lane l to lanes dx1020for (c = 0; c < dim; ++c)1021bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);1022// add dPdy from 
lane l to lanes dy1023for (c = 0; c < dim; ++c)1024bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);1025// normalize cube coordinates if necessary1026if (i->tex.target.isCube()) {1027for (c = 0; c < 3; ++c)1028src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);1029val = bld.getScratch();1030bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);1031bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);1032bld.mkOp1(OP_RCP, TYPE_F32, val, val);1033for (c = 0; c < 3; ++c)1034src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);1035} else {1036for (c = 0; c < dim; ++c)1037src[c] = crd[c];1038}1039// texture1040bld.insert(tex = cloneForward(func, i));1041for (c = 0; c < dim; ++c)1042tex->setSrc(c, src[c]);1043// save results1044for (c = 0; i->defExists(c); ++c) {1045Instruction *mov;1046def[c][l] = bld.getSSA();1047mov = bld.mkMov(def[c][l], tex->getDef(c));1048mov->fixed = 1;1049mov->lanes = 1 << l;1050}1051}1052bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);10531054for (c = 0; i->defExists(c); ++c) {1055Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));1056for (l = 0; l < 4; ++l)1057u->setSrc(l, def[c][l]);1058}10591060i->bb->remove(i);1061return true;1062}10631064bool1065NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)1066{1067handleTEX(i);1068bld.setPosition(i, true);10691070/* The returned values are not quite what we want:1071* (a) convert from s32 to f321072* (b) multiply by 1/2561073*/1074for (int def = 0; def < 2; ++def) {1075if (!i->defExists(def))1076continue;1077bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));1078bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),1079i->getDef(def), bld.loadImm(NULL, 1.0f / 256));1080}1081return true;1082}10831084bool1085NV50LoweringPreSSA::handleTXQ(TexInstruction *i)1086{1087Value *ms, *ms_x, *ms_y;1088if (i->tex.query == TXQ_DIMS) {1089if (i->tex.target.isMS()) {1090bld.setPosition(i, true);1091loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);1092int d = 0;1093if (i->tex.mask & 1) 
{1094bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);1095d++;1096}1097if (i->tex.mask & 2) {1098bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);1099d++;1100}1101}1102return true;1103}1104assert(i->tex.query == TXQ_TYPE);1105assert(i->tex.mask == 4);11061107loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);1108bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);1109i->bb->remove(i);11101111return true;1112}11131114bool1115NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)1116{1117const int dim = suq->tex.target.getDim();1118const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());1119int mask = suq->tex.mask;1120int slot = suq->tex.r;1121int c, d;11221123for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {1124if (c >= arg || !(mask & 1))1125continue;11261127int offset;11281129if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {1130offset = NV50_SU_INFO_SIZE(2);1131} else {1132offset = NV50_SU_INFO_SIZE(c);1133}1134bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));1135if (c == 2 && suq->tex.target.isCube())1136bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),1137bld.loadImm(NULL, 6));1138}11391140if (mask & 1) {1141if (suq->tex.target.isMS()) {1142Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));1143Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));1144Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);1145bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);1146} else {1147bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));1148}1149}11501151bld.remove(suq);1152return true;1153}11541155bool1156NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)1157{1158bufq->op = OP_MOV;1159bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));1160bufq->setIndirect(0, 0, NULL);1161bufq->setIndirect(0, 1, NULL);1162return true;1163}11641165bool1166NV50LoweringPreSSA::handleSET(Instruction *i)1167{1168if (i->dType == TYPE_F32) 
{1169bld.setPosition(i, true);1170i->dType = TYPE_U32;1171bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));1172bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));1173}1174return true;1175}11761177bool1178NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)1179{1180Value *src0 = bld.getSSA();1181Value *src1 = bld.getSSA();1182Value *pred = bld.getScratch(1, FILE_FLAGS);11831184Value *v0 = i->getSrc(0);1185Value *v1 = i->getSrc(1);1186// XXX: these probably shouldn't be immediates in the first place ...1187if (v0->asImm())1188v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);1189if (v1->asImm())1190v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);11911192bld.setPosition(i, true);1193bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);1194bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);1195bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);11961197bld.setPosition(i, false);1198i->op = OP_SET;1199i->setFlagsDef(0, pred);1200i->dType = TYPE_U8;1201i->setSrc(0, i->getSrc(2));1202i->setSrc(2, NULL);1203i->setSrc(1, bld.loadImm(NULL, 0));12041205return true;1206}12071208bool1209NV50LoweringPreSSA::handleSELP(Instruction *i)1210{1211Value *src0 = bld.getSSA();1212Value *src1 = bld.getSSA();12131214Value *v0 = i->getSrc(0);1215Value *v1 = i->getSrc(1);1216if (v0->asImm())1217v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);1218if (v1->asImm())1219v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);12201221bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));1222bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));1223bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);1224delete_Instruction(prog, i);1225return true;1226}12271228bool1229NV50LoweringPreSSA::handleWRSV(Instruction *i)1230{1231Symbol *sym = i->getSrc(0)->asSym();12321233// these are all shader outputs, $sreg are not writeable1234uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);1235if (addr >= 0x400)1236return false;1237sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, 
addr);12381239bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));12401241bld.getBB()->remove(i);1242return true;1243}12441245bool1246NV50LoweringPreSSA::handleCALL(Instruction *i)1247{1248if (prog->getType() == Program::TYPE_COMPUTE) {1249// Add implicit "thread id" argument in $r0 to the function1250i->setSrc(i->srcCount(), tid);1251}1252return true;1253}12541255bool1256NV50LoweringPreSSA::handlePRECONT(Instruction *i)1257{1258delete_Instruction(prog, i);1259return true;1260}12611262bool1263NV50LoweringPreSSA::handleCONT(Instruction *i)1264{1265i->op = OP_BRA;1266return true;1267}12681269bool1270NV50LoweringPreSSA::handleRDSV(Instruction *i)1271{1272Symbol *sym = i->getSrc(0)->asSym();1273uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);1274Value *def = i->getDef(0);1275SVSemantic sv = sym->reg.data.sv.sv;1276int idx = sym->reg.data.sv.index;12771278if (addr >= 0x400) // mov $sreg1279return true;12801281switch (sv) {1282case SV_POSITION:1283assert(prog->getType() == Program::TYPE_FRAGMENT);1284bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);1285break;1286case SV_FACE:1287bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);1288if (i->dType == TYPE_F32) {1289bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));1290bld.mkOp1(OP_NEG, TYPE_S32, def, def);1291bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);1292}1293break;1294case SV_NCTAID:1295case SV_CTAID:1296case SV_NTID: {1297Value *x = bld.getSSA(2);1298bld.mkOp1(OP_LOAD, TYPE_U16, x,1299bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));1300bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);1301break;1302}1303case SV_TID:1304if (idx == 0) {1305bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));1306} else if (idx == 1) {1307bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));1308bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));1309} else if (idx == 2) {1310bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));1311} else {1312bld.mkMov(def, 
bld.mkImm(0));1313}1314break;1315case SV_COMBINED_TID:1316bld.mkMov(def, tid);1317break;1318case SV_SAMPLE_POS: {1319Value *off = new_LValue(func, FILE_ADDRESS);1320bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));1321bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));1322bld.mkLoad(TYPE_F32,1323def,1324bld.mkSymbol(1325FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,1326TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),1327off);1328break;1329}1330case SV_THREAD_KILL:1331// Not actually supported. But it's implementation-dependent, so we can1332// always just say it's not a helper.1333bld.mkMov(def, bld.loadImm(NULL, 0));1334break;1335default:1336bld.mkFetch(i->getDef(0), i->dType,1337FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);1338break;1339}1340bld.getBB()->remove(i);1341return true;1342}13431344bool1345NV50LoweringPreSSA::handleDIV(Instruction *i)1346{1347if (!isFloatType(i->dType))1348return true;1349bld.setPosition(i, false);1350Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));1351i->op = OP_MUL;1352i->setSrc(1, rcp->getDef(0));1353return true;1354}13551356bool1357NV50LoweringPreSSA::handleSQRT(Instruction *i)1358{1359bld.setPosition(i, true);1360i->op = OP_RSQ;1361bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));13621363return true;1364}13651366bool1367NV50LoweringPreSSA::handlePOW(Instruction *i)1368{1369LValue *val = bld.getScratch();13701371bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));1372bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;1373bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);13741375i->op = OP_EX2;1376i->setSrc(0, val);1377i->setSrc(1, NULL);13781379return true;1380}13811382bool1383NV50LoweringPreSSA::handleEXPORT(Instruction *i)1384{1385if (prog->getType() == Program::TYPE_FRAGMENT) {1386if (i->getIndirect(0, 0)) {1387// TODO: redirect to l[] here, load to GPRs at exit1388return false;1389} else {1390int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg 
// Expands an atomic operation on shared memory into an explicit
// load / modify / store sequence spread over several basic blocks.
//
// Control flow built here (see the cfg.attach() calls):
//   currBB         -> tryLockBB
//   tryLockBB      -> setAndUnlockBB   (branch on CC_LT of `locked`)
//   tryLockBB      -> failLockBB
//   setAndUnlockBB -> failLockBB
//   failLockBB     -> tryLockBB        (back edge, branch on CC_GEU of `locked`)
//   failLockBB     -> joinBB
//
// On chipsets >= 0xa0 the load/store carry the LOAD_LOCKED / STORE_UNLOCKED
// sub-ops; on older chipsets the `locked` flag is instead produced by a MOV
// of an immediate.
//
// Returns true on success, false if atom->subOp is not one of the handled
// atomic sub-operations.
bool
NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   // Split the block around the atomic so it sits alone between currBB and
   // joinBB, and create the two extra blocks of the retry loop.
   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   // currBB: record the join point, then jump into the loop.
   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   // tryLockBB: read the current value into the atomic's destination.
   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   Value *locked = bld.getSSA(1, FILE_FLAGS);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      // The load itself defines the `locked` flag.
      ld->setFlagsDef(1, locked);
      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
   } else {
      // No locked-load sub-op is used here; `locked` is defined by a MOV of
      // the immediate 2 whose def is marked as the flags def.
      // NOTE(review): presumably this value makes the CC_LT branch below
      // always taken on these chipsets — confirm.
      bld.mkMov(locked, bld.loadImm(NULL, 2))
         ->flagsDef = 0;
   }

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   // The split left tryLockBB connected to joinBB; that edge is replaced by
   // the ones above. The atomic is unlinked here but its operands are still
   // read below.
   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   // setAndUnlockBB: compute the value to write back.
   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // Compare the loaded value against source 1 and select between
      // source 2 and the loaded value based on the result.
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
                   ld->getDef(0), set->getDef(0));
      stVal = selp->getDef(0);

      // The SELP was created by this pass, so it is lowered immediately.
      handleSELP(selp);
   } else {
      // Remaining sub-ops map one-to-one onto a two-operand ALU operation
      // applied to the loaded value and source 1.
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return false;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));

      stVal = i->getDef(0);
   }

   // Write the new value back to the same shared-memory location.
   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                                    atom->getIndirect(0, 0), stVal);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
   }

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Loop until the lock is acquired.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   // joinBB: emit the JOIN matching the JOINAT recorded in currBB, at the
   // head of the block, and mark it fixed.
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;

   return true;
}
OP_ATOM)1601handleSharedATOM(i);1602} else if (sym->inFile(FILE_MEMORY_GLOBAL)) {1603// All global access must be indirect. There are no instruction forms1604// with direct access.1605Value *addr = i->getIndirect(0, 0);16061607Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);1608Value *sum;1609if (addr != NULL)1610sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,1611offset);1612else1613sum = offset;16141615i->setIndirect(0, 0, sum);1616sym->reg.data.offset = 0;1617}16181619return true;1620}16211622bool1623NV50LoweringPreSSA::handleMEMBAR(Instruction *i)1624{1625// For global memory, apparently doing a bunch of reads at different1626// addresses forces things to get sufficiently flushed.1627if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {1628uint8_t b = prog->driver->io.auxCBSlot;1629Value *base =1630bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,1631prog->driver->io.membarOffset), NULL);1632Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));1633Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),1634bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),1635physid, bld.loadImm(NULL, 0x1f)),1636bld.loadImm(NULL, 2));1637base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);1638Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);1639for (int i = 0; i < 8; i++) {1640if (i != 0) {1641base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));1642}1643bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)1644->fixed = 1;1645}1646}16471648// Both global and shared memory barriers also need a regular control bar1649// TODO: double-check this is the case1650i->op = OP_BAR;1651i->subOp = NV50_IR_SUBOP_BAR_SYNC;1652i->setSrc(0, bld.mkImm(0u));1653i->setSrc(1, bld.mkImm(0u));16541655return true;1656}16571658// The type that bests represents how each component can be stored when packed.1659static DataType1660getPackedType(const 
TexInstruction::ImgFormatDesc *t, int c)1661{1662switch (t->type) {1663case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;1664case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;1665case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;1666case UINT:1667return (t->bits[c] == 8 ? TYPE_U8 :1668(t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));1669case SINT:1670return (t->bits[c] == 8 ? TYPE_S8 :1671(t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));1672}1673return TYPE_NONE;1674}16751676// The type that the rest of the shader expects to process this image type in.1677static DataType1678getShaderType(const ImgType type) {1679switch (type) {1680case FLOAT:1681case UNORM:1682case SNORM:1683return TYPE_F32;1684case UINT:1685return TYPE_U32;1686case SINT:1687return TYPE_S32;1688default:1689assert(!"Impossible type");1690return TYPE_NONE;1691}1692}16931694// Reads the raw coordinates out of the input instruction, and returns a1695// single-value coordinate which is what the hardware expects to receive in a1696// ld/st op.1697Value *1698NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)1699{1700const int slot = su->tex.r;1701const int dim = su->tex.target.getDim();1702const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());17031704const TexInstruction::ImgFormatDesc *format = su->tex.format;1705const uint16_t bytes = (format->bits[0] + format->bits[1] +1706format->bits[2] + format->bits[3]) / 8;1707uint16_t shift = ffs(bytes) - 1;17081709// Buffer sizes don't necessarily fit in 16-bit values1710if (su->tex.target == TEX_TARGET_BUFFER) {1711return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),1712su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));1713}17141715// For buffers, we just need the byte offset. 
And for 2d buffers we want1716// the x coordinate in bytes as well.1717Value *coords[3] = {};1718for (int i = 0; i < arg; i++) {1719Value *src[2];1720bld.mkSplit(src, 2, su->getSrc(i));1721coords[i] = src[0];1722// For 1d-images, we want the y coord to be 0, which it will be here.1723if (i == 0)1724coords[1] = src[1];1725}17261727coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),1728coords[0], bld.loadImm(NULL, shift));17291730if (su->tex.target.isMS()) {1731Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));1732Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));1733coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);1734coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);1735}17361737// If there are more dimensions, we just want the y-offset. But that needs1738// to be adjusted up by the y-stride for array images.1739if (su->tex.target.isArray() || su->tex.target.isCube()) {1740Value *index = coords[dim];1741Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);1742Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);1743mul->sType = TYPE_U16;1744Value *muls[2];1745bld.mkSplit(muls, 2, mul->getDef(0));1746if (dim > 1)1747coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);1748else1749coords[1] = muls[0];1750}17511752// 3d is special-cased. Note that a single "slice" of a 3d image may1753// also be attached as 2d, so we have to do the same 3d processing for1754// 2d as well, just in case. 
In order to remap a 3d image onto a 2d1755// image, we have to retile it "by hand".1756if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {1757Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);1758Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);1759// Add the z coordinate for actual 3d-images1760if (dim > 2)1761coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);1762else1763coords[2] = z;17641765// Compute the surface parameters from tile shifts1766Value *tile_shift[3];1767Value *tile_size[3];1768Value *tile_mask[3];1769// We only ever use one kind of X-tiling.1770tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);1771tile_size[0] = bld.loadImm(NULL, (uint16_t)64);1772tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);1773// Fetch the "real" tiling parameters of the underlying surface1774for (int i = 1; i < 3; i++) {1775tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));1776tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);1777tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));1778}17791780// Compute the location of given coordinate, both inside the tile as1781// well as which (linearly-laid out) tile it's in.1782Value *coord_in_tile[3];1783Value *tile[3];1784for (int i = 0; i < 3; i++) {1785coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);1786tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);1787}17881789// Based on the "real" tiling parameters, compute x/y coordinates in the1790// larger surface with 2d tiling that was supplied to the hardware. 
This1791// was determined and verified with the help of the tiling pseudocode in1792// the envytools docs.1793//1794// adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +1795// z_coord_in_tile * x_tile_size1796// adj_y = y_coord_in_tile + y_tile * y_tile_size +1797// z_tile * y_tile_size * y_tiles1798//1799// Note: STRIDE_Y = y_tile_size * y_tiles18001801coords[0] = bld.mkOp2v(1802OP_ADD, TYPE_U16, bld.getSSA(2),1803bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),1804coord_in_tile[0],1805bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),1806tile[0],1807bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),1808tile_shift[2], tile_shift[0]))),1809bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),1810coord_in_tile[2], tile_shift[0]));18111812Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),1813tile[2], y_size_aligned);1814mul->sType = TYPE_U16;1815Value *muls[2];1816bld.mkSplit(muls, 2, mul->getDef(0));18171818coords[1] = bld.mkOp2v(1819OP_ADD, TYPE_U16, bld.getSSA(2),1820muls[0],1821bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),1822coord_in_tile[1],1823bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),1824tile[1], tile_shift[1])));1825}18261827return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);1828}18291830// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but1831// adjusted to make use of 16-bit math where possible.1832bool1833NV50LoweringPreSSA::handleSULDP(TexInstruction *su)1834{1835const int slot = su->tex.r;1836assert(!su->getIndirectR());18371838bld.setPosition(su, false);18391840const TexInstruction::ImgFormatDesc *format = su->tex.format;1841const int bytes = (su->tex.format->bits[0] +1842su->tex.format->bits[1] +1843su->tex.format->bits[2] +1844su->tex.format->bits[3]) / 8;1845DataType ty = typeOfSize(bytes);18461847Value *coord = processSurfaceCoords(su);18481849Value *untypedDst[4] = {};1850Value *typedDst[4] = {};1851int i;1852for (i = 0; i < bytes / 4; i++)1853untypedDst[i] = bld.getSSA();1854if (bytes < 4)1855untypedDst[0] = 
bld.getSSA();18561857for (i = 0; i < 4; i++)1858typedDst[i] = su->getDef(i);18591860Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);1861for (i = 0; i < 4 && untypedDst[i]; i++)1862load->setDef(i, untypedDst[i]);18631864// Unpack each component into the typed dsts1865int bits = 0;1866for (int i = 0; i < 4; bits += format->bits[i], i++) {1867if (!typedDst[i])1868continue;18691870if (i >= format->components) {1871if (format->type == FLOAT ||1872format->type == UNORM ||1873format->type == SNORM)1874bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);1875else1876bld.loadImm(typedDst[i], i == 3 ? 1 : 0);1877continue;1878}18791880// Get just that component's data into the relevant place1881if (format->bits[i] == 32)1882bld.mkMov(typedDst[i], untypedDst[i]);1883else if (format->bits[i] == 16) {1884// We can always convert directly from the appropriate half of the1885// loaded value into the typed result.1886Value *src[2];1887bld.mkSplit(src, 2, untypedDst[i / 2]);1888bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],1889getPackedType(format, i), src[i & 1]);1890}1891else if (format->bits[i] == 8) {1892// Same approach as for 16 bits, but we have to massage the value a1893// bit more, since we have to get the appropriate 8 bits from the1894// half-register. In all cases, we can CVT from a 8-bit source, so we1895// only have to shift when we want the upper 8 bits.1896Value *src[2], *shifted;1897bld.mkSplit(src, 2, untypedDst[0]);1898DataType packedType = getPackedType(format, i);1899if (i & 1)1900shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));1901else1902shifted = src[!!(i & 2)];19031904bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],1905packedType, shifted);1906}1907else {1908// The options are 10, 11, and 2. Get it into a 32-bit reg, then1909// shift/mask. That's where it'll have to end up anyways. 
For signed,1910// we have to make sure to get sign-extension, so we actually have to1911// shift *up* first, and then shift down. There's no advantage to1912// AND'ing, so we don't.1913DataType ty = TYPE_U32;1914if (format->type == SNORM || format->type == SINT) {1915ty = TYPE_S32;1916}19171918// Poor man's EXTBF1919bld.mkOp2(1920OP_SHR, ty, typedDst[i],1921bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),1922bld.loadImm(NULL, 32 - format->bits[i]));19231924// If the stored data is already in the appropriate type, we don't1925// have to do anything. Convert to float for the *NORM formats.1926if (format->type == UNORM || format->type == SNORM)1927bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);1928}19291930// Normalize / convert as necessary1931if (format->type == UNORM)1932bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));1933else if (format->type == SNORM)1934bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));1935else if (format->type == FLOAT && format->bits[i] < 16) {1936// We expect the value to be in the low bits of the register, so we1937// have to shift back up.1938bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));1939Value *src[2];1940bld.mkSplit(src, 2, typedDst[i]);1941bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);1942}1943}19441945if (format->bgra) {1946std::swap(typedDst[0], typedDst[2]);1947}19481949bld.getBB()->remove(su);1950return true;1951}19521953bool1954NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)1955{1956const int slot = su->tex.r;1957const int dim = su->tex.target.getDim();1958const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());1959assert(!su->getIndirectR());19601961bld.setPosition(su, false);19621963Value *coord = processSurfaceCoords(su);19641965// This is 
// Lowers a typed surface store: the four shader-side source values (the
// sources following the `arg` coordinate sources) are converted and packed
// into the image format's memory layout, merged into a single register and
// written with a plain OP_STORE to FILE_MEMORY_GLOBAL at the address computed
// by processSurfaceCoords(). `su` is removed from its block.
// Always returns true.
bool
NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   // Number of coordinate sources; array and cube targets have one extra.
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   assert(!su->getIndirectR());

   bld.setPosition(su, false);

   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   // Total texel size in bytes; determines the width of the store.
   const int bytes = (su->tex.format->bits[0] +
                      su->tex.format->bits[1] +
                      su->tex.format->bits[2] +
                      su->tex.format->bits[3]) / 8;
   DataType ty = typeOfSize(bytes);

   Value *coord = processSurfaceCoords(su);

   // The packed values we will eventually store into memory
   Value *untypedDst[4] = {};
   // Each component's packed representation, in 16-bit registers (only used
   // where appropriate)
   Value *untypedDst16[4] = {};
   // The original values that are being packed
   Value *typedDst[4] = {};
   int i;

   for (i = 0; i < bytes / 4; i++)
      untypedDst[i] = bld.getSSA();
   for (i = 0; i < format->components; i++)
      untypedDst16[i] = bld.getSSA(2);
   // Make sure we get at least one of each value allocated for the
   // super-narrow formats.
   if (bytes < 4)
      untypedDst[0] = bld.getSSA();
   if (bytes < 2)
      untypedDst16[0] = bld.getSSA(2);

   // Work on copies of the sources so the conversions below can modify them
   // in place without touching the original values.
   for (i = 0; i < 4; i++) {
      typedDst[i] = bld.getSSA();
      bld.mkMov(typedDst[i], su->getSrc(arg + i));
   }

   // For BGRA formats the first and third values trade places before packing.
   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }

   // Pack each component into the untyped dsts. `bits` tracks the bit offset
   // of component i within the texel.
   int bits = 0;
   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
      // Un-normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));

      // There is nothing to convert/pack for 32-bit values
      if (format->bits[i] == 32) {
         bld.mkMov(untypedDst[i], typedDst[i]);
         continue;
      }

      // The remainder of the cases will naturally want to deal in 16-bit
      // registers. We will put these into untypedDst16 and then merge them
      // together later.
      if (format->type == FLOAT && format->bits[i] < 16) {
         // Small floats: convert to F16, then shift right by
         // (15 - bits) so only the top `bits` bits below the sign remain.
         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));

         // For odd bit sizes, it's easier to pack it into the final
         // destination directly.
         Value *tmp = bld.getSSA();
         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
         if (i == 0) {
            // First component starts the packed word.
            untypedDst[0] = tmp;
         } else {
            // Later components are shifted to their bit offset and OR'd in.
            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
         }
      } else if (format->bits[i] == 16) {
         // We can always convert the shader value into the packed value
         // directly here
         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
                   getShaderType(format->type), typedDst[i]);
      } else if (format->bits[i] < 16) {
         DataType packedType = getPackedType(format, i);
         DataType shaderType = getShaderType(format->type);
         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
         }
         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
         // the size, it's easier to dump them into a 32-bit value and OR
         // everything later.
         if (format->bits[i] != 8) {
            // Restrict value to the appropriate bits (although maybe supposed
            // to clamp instead?)
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
            // And merge into final packed value
            Value *tmp = bld.getSSA();
            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
            if (i == 0) {
               untypedDst[0] = tmp;
            } else {
               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
            }
         } else if (i & 1) {
            // Shift the 8-bit value up (so that it can be OR'd later)
            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
         } else if (packedType != TYPE_U8) {
            // S8 (or the *16 if converted from float) will all have high bits
            // set, so AND them out.
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
         }
      }
   }

   // OR pairs of 8-bit values together (into the even value)
   if (format->bits[0] == 8) {
      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
   }

   // We'll always want to have at least a 32-bit source register for the store
   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
   if (format->bits[0] == 32) {
      // One 32-bit word per component.
      for (i = 0; i < 4 && untypedDst[i]; i++)
         merge->setSrc(i, untypedDst[i]);
   } else if (format->bits[0] == 16) {
      // One 16-bit half per component; a lone half gets a fresh (unwritten)
      // partner so the merge still has two sources.
      for (i = 0; i < 4 && untypedDst16[i]; i++)
         merge->setSrc(i, untypedDst16[i]);
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else if (format->bits[0] == 8) {
      // The even-indexed halves hold the OR'd byte pairs; again pad a lone
      // half with a fresh partner.
      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
         merge->setSrc(i, untypedDst16[2 * i]);
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else {
      // Odd-sized formats were packed straight into untypedDst[0].
      merge->setSrc(0, untypedDst[0]);
   }

   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));

   bld.getBB()->remove(su);
   return true;
}
FILE_FLAGS);21732174bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);21752176insn->setPredicate(insn->cc, cdst);2177}21782179//2180// - add quadop dance for texturing2181// - put FP outputs in GPRs2182// - convert instruction sequences2183//2184bool2185NV50LoweringPreSSA::visit(Instruction *i)2186{2187bld.setPosition(i, false);21882189if (i->cc != CC_ALWAYS)2190checkPredicate(i);21912192switch (i->op) {2193case OP_TEX:2194case OP_TXF:2195case OP_TXG:2196return handleTEX(i->asTex());2197case OP_TXB:2198return handleTXB(i->asTex());2199case OP_TXL:2200return handleTXL(i->asTex());2201case OP_TXD:2202return handleTXD(i->asTex());2203case OP_TXLQ:2204return handleTXLQ(i->asTex());2205case OP_TXQ:2206return handleTXQ(i->asTex());2207case OP_EX2:2208bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));2209i->setSrc(0, i->getDef(0));2210break;2211case OP_SET:2212return handleSET(i);2213case OP_SLCT:2214return handleSLCT(i->asCmp());2215case OP_SELP:2216return handleSELP(i);2217case OP_POW:2218return handlePOW(i);2219case OP_DIV:2220return handleDIV(i);2221case OP_SQRT:2222return handleSQRT(i);2223case OP_EXPORT:2224return handleEXPORT(i);2225case OP_LOAD:2226return handleLOAD(i);2227case OP_MEMBAR:2228return handleMEMBAR(i);2229case OP_ATOM:2230case OP_STORE:2231return handleLDST(i);2232case OP_SULDP:2233return handleSULDP(i->asTex());2234case OP_SUSTP:2235return handleSUSTP(i->asTex());2236case OP_SUREDP:2237return handleSUREDP(i->asTex());2238case OP_SUQ:2239return handleSUQ(i->asTex());2240case OP_BUFQ:2241return handleBUFQ(i);2242case OP_RDSV:2243return handleRDSV(i);2244case OP_WRSV:2245return handleWRSV(i);2246case OP_CALL:2247return handleCALL(i);2248case OP_PRECONT:2249return handlePRECONT(i);2250case OP_CONT:2251return handleCONT(i);2252case OP_PFETCH:2253return handlePFETCH(i);2254default:2255break;2256}2257return true;2258}22592260bool2261TargetNV50::runLegalizePass(Program *prog, CGStage stage) const2262{2263bool ret 
= false;22642265if (stage == CG_STAGE_PRE_SSA) {2266NV50LoweringPreSSA pass(prog);2267ret = pass.run(prog, false, true);2268} else2269if (stage == CG_STAGE_SSA) {2270if (!prog->targetPriv)2271prog->targetPriv = new std::list<Instruction *>();2272NV50LegalizeSSA pass(prog);2273ret = pass.run(prog, false, true);2274} else2275if (stage == CG_STAGE_POST_RA) {2276NV50LegalizePostRA pass;2277ret = pass.run(prog, false, true);2278if (prog->targetPriv)2279delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);2280}2281return ret;2282}22832284} // namespace nv50_ir228522862287