// Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
// 4574 views
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

// Per-lane quad operations, packed two bits per lane by QUADOP below.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL        UR        LL        LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

// Lower 32-bit integer DIV/MOD to a call into the driver's builtin library.
// The operands are moved into the fixed input registers of the calling
// convention, and the result is fetched from r0 (DIV) or r1 (MOD).
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;

   bld.setPosition(i, false);

   // Generate movs to the input regs for the call we want to generate
   for (int s = 0; i->srcExists(s); ++s) {
      Instruction *ld = i->getSrc(s)->getInsn();
      // check if we are moving an immediate, propagate it in that case
      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
            !(ld->src(0).getFile() == FILE_IMMEDIATE))
         bld.mkMovToReg(s, i->getSrc(s));
      else {
         assert(ld->getSrc(0) != NULL);
         bld.mkMovToReg(s, ld->getSrc(0));
         // Clear the src, to make code elimination possible here before we
         // delete the instruction i later
         i->setSrc(s, NULL);
         if (ld->isDead())
            delete_Instruction(prog, ld);
      }
   }

   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   // DIV result comes back in r0, MOD result in r1.
   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

// Lower 64-bit RCP/RSQ to a builtin-library call (GK104+ path). src[] holds
// the two 32-bit halves of the f64 source, already split by the caller.
void
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
{
   FlowInstruction *call;
   Value *def[2];
   int builtin;

   // Move the two source halves into the call's fixed input registers.
   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);

   if (i->op == OP_RCP)
      builtin = NVC0_BUILTIN_RCP_F64;
   else
      builtin = NVC0_BUILTIN_RSQ_F64;

   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   def[0] = bld.getSSA();
   def[1] = bld.getSSA();
   // Result halves come back in r0/r1; merge them into the 64-bit def.
   bld.mkMovFromReg(def[0], 0);
   bld.mkMovFromReg(def[1], 1);
   bld.mkClobber(FILE_GPR, 0x3fc, 2);
   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);

   prog->fp64 = true;
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   int chip = prog->getTarget()->getChipset();
   if (chip >= NVISA_GK104_CHIPSET) {
      // Kepler and up use the builtin-library implementation instead.
      handleRCPRSQLib(i, src);
      return;
   }

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

// Set the flush-to-zero flag on f32 ops where the hardware supports it.
void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

// If a TXL/TXF has a constant LOD of 0, drop the LOD argument and use the
// cheaper levelZero form (TXL additionally becomes a plain TEX).
void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   // Remove the now-unneeded LOD source.
   i->moveSources(arg + 1, -1);
}

// Lower a 64-bit shift into 32-bit operations on the split halves.
void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more directed SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts >= 32 and less than 32 as
      // separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
      // can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      // x32_minus_shift = 32 - shift (via negated-src ADD).
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32)
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      // Exactly one of hi1/hi2 was written (opposite predicates); UNION picks it.
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   // SM35+: use the funnel-shift (SHF) form — one op per half.
   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   bld.setPosition(hi, true);

   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}

// Lower a 64-bit integer comparison: subtract the low halves to produce a
// carry, then compare the high halves with the carry folded in.
void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];
   bld.setPosition(cmp, false);

   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}

// Lower bit-reverse to EXTBF with the reverse subop; 0x2000 encodes the
// (offset, width) bitfield argument covering the full 32-bit word.
void
NVC0LegalizeSSA::handleBREV(Instruction *i)
{
   i->op = OP_EXTBF;
   i->subOp = NV50_IR_SUBOP_EXTBF_REV;
   i->setSrc(1, bld.mkImm(0x2000));
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

// Dispatch each instruction of the block to the appropriate legalization
// handler above. Iterates via a saved `next` so handlers may delete `i`.
bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      // Graphics shaders get implicit flush-to-zero semantics on f32.
      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      case OP_SHR:
      case OP_SHL:
         if (typeSizeof(i->sType) == 8)
            handleShift(i);
         break;
      case OP_SET:
      case OP_SET_AND:
      case OP_SET_OR:
      case OP_SET_XOR:
         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
            handleSET(i->asCmp());
         break;
      case OP_BREV:
         handleBREV(i);
         break;
      default:
         break;
      }
   }
   return true;
}

// Texture barriers are only needed on Kepler-era chips (0xe0 <= chipset < 0x110).
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

// True if `early` dominates `later` (same block: ordered by serial number).
bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use the an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   // GPR range written by the tex (post-RA, so these are physical ids).
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

// DFS over the CFG starting at `start`, recording the first instruction on
// each path that reads or writes a GPR in [minGPR, maxGPR].
void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   // No use in this block on this path — continue into all successors.
   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // same block: level = number of TEXes between this one and the use
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // merge with an adjacent barrier, keeping the lower level
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

// Set up the fixed-register placeholders ($r63/$r255 zero reg, $c0, $p7 "one"
// predicate) used by the rewrites below, and run texbar insertion if needed.
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   // zero register is $r63 on Fermi, $r255 on Kepler+ (more GPRs)
   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

// Replace immediate-zero sources with the hardware zero register (and for
// SELP's predicate source, with the constant-true predicate, inverting as
// needed), since immediates can't always be encoded directly.
void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      // SUCLAMP src2 and SHLADD src1 must stay immediates.
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      if (s == 1 && i->op == OP_SHLADD)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // find the (single) back edge into this block
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

// replaces instructions which would end up as f2f or i2i with faster
// alternatives:
//  - fabs(a) -> fadd(0, abs a)
//  - fneg(a) -> fadd(neg 0, neg a)
//  - ineg(a) -> iadd(0, neg a)
//  - fneg(abs a) -> fadd(neg 0, neg abs a)
//  - sat(a) -> sat add(0, a)
void
NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
{
   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
      return;
   if (cvt->sType != cvt->dType)
      return;
   // we could make it work, but in this case we have optimizations disabled
   // and we don't really care either way.
   if (cvt->src(0).getFile() != FILE_GPR &&
       cvt->src(0).getFile() != FILE_MEMORY_CONST)
      return;

   Modifier mod0, mod1;

   switch (cvt->op) {
   case OP_ABS:
      if (cvt->src(0).mod)
         return;
      if (!isFloatType(cvt->sType))
         return;
      mod0 = 0;
      mod1 = NV50_IR_MOD_ABS;
      break;
   case OP_NEG:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
         return;
      if (isFloatType(cvt->sType) &&
          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
         return;

      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
      break;
   case OP_SAT:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
         return;
      mod0 = 0;
      mod1 = cvt->src(0).mod;
      cvt->saturate = true;
      break;
   default:
      return;
   }

   // rewrite as ADD(0, src) carrying the modifiers computed above
   cvt->op = OP_ADD;
   cvt->moveSources(0, 1);
   cvt->setSrc(0, rZero);
   cvt->src(0).mod = mod0;
   cvt->src(1).mod = mod1;
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         // fold large offsets into the constant-buffer file index
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) >= 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);

         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
            replaceCvt(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()),
   gpEmitAddress(NULL)
{
   bld.setProgram(prog);
}

// For geometry shaders, set up the emit-address counter in the entry block
// and flush it (FINAL on GV100+, then mov to r0) at the exit.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET)
            bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1;
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

// Load a texture handle from the driver's texture-binding area in the aux
// constant buffer; `ptr` (if non-NULL) indexes into it (scaled by 4).
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      // normalize cube coords: divide each by the largest |component|
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         // pack the sampler handle into the high half of the texture handle
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         // TXF layers are integer indices; others are floats to be saturated
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         // shift coords up to make room for the combined arg in src 0
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert the tic/tsc relative indices into their bitfields
      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };

   Value *def[4][4];
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates).
Maxwell is1209// handled in a separate function.1210int array;1211if (targ->getChipset() < NVISA_GK104_CHIPSET)1212array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;1213else1214array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);12151216i->op = OP_TEX; // no need to clone dPdx/dPdy later12171218for (c = 0; c < dim; ++c)1219crd[c] = bld.getScratch();1220for (c = 0; c < array; ++c)1221arr[c] = bld.getScratch();1222shadow = bld.getScratch();12231224for (l = 0; l < 4; ++l) {1225Value *src[3], *val;12261227bld.mkOp(OP_QUADON, TYPE_NONE, NULL);1228// we're using the texture result from lane 0 in all cases, so make sure1229// that lane 0 is pointing at the proper array index, indirect value,1230// and depth compare.1231if (l != 0) {1232for (c = 0; c < array; ++c)1233bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);1234if (i->tex.target.isShadow()) {1235// The next argument after coords is the depth compare1236bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);1237}1238}1239// mov position coordinates from lane l to all lanes1240for (c = 0; c < dim; ++c)1241bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);1242// add dPdx from lane l to lanes dx1243for (c = 0; c < dim; ++c)1244bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);1245// add dPdy from lane l to lanes dy1246for (c = 0; c < dim; ++c)1247bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);1248// normalize cube coordinates1249if (i->tex.target.isCube()) {1250for (c = 0; c < 3; ++c)1251src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);1252val = bld.getScratch();1253bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);1254bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);1255bld.mkOp1(OP_RCP, TYPE_F32, val, val);1256for (c = 0; c < 3; ++c)1257src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);1258} else {1259for (c = 0; c < dim; ++c)1260src[c] = crd[c];1261}1262// texture1263bld.insert(tex = cloneForward(func, i));1264if (l != 0) {1265for (c = 0; c 
< array; ++c)1266tex->setSrc(c, arr[c]);1267if (i->tex.target.isShadow())1268tex->setSrc(array + dim, shadow);1269}1270for (c = 0; c < dim; ++c)1271tex->setSrc(c + array, src[c]);1272// broadcast results from lane 0 to all lanes so that the moves *into*1273// the target lane pick up the proper value.1274if (l != 0)1275for (c = 0; i->defExists(c); ++c)1276bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);1277bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);12781279// save results1280for (c = 0; i->defExists(c); ++c) {1281Instruction *mov;1282def[c][l] = bld.getSSA();1283mov = bld.mkMov(def[c][l], tex->getDef(c));1284mov->fixed = 1;1285mov->lanes = 1 << l;1286}1287}12881289for (c = 0; i->defExists(c); ++c) {1290Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));1291for (l = 0; l < 4; ++l)1292u->setSrc(l, def[c][l]);1293}12941295i->bb->remove(i);1296return true;1297}12981299bool1300NVC0LoweringPass::handleTXD(TexInstruction *txd)1301{1302int dim = txd->tex.target.getDim() + txd->tex.target.isCube();1303unsigned arg = txd->tex.target.getArgCount();1304unsigned expected_args = arg;1305const int chipset = prog->getTarget()->getChipset();13061307if (chipset >= NVISA_GK104_CHIPSET) {1308if (!txd->tex.target.isArray() && txd->tex.useOffsets)1309expected_args++;1310if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)1311expected_args++;1312} else {1313if (txd->tex.useOffsets)1314expected_args++;1315if (!txd->tex.target.isArray() && (1316txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))1317expected_args++;1318}13191320if (expected_args > 4 ||1321dim > 2 ||1322txd->tex.target.isShadow())1323txd->op = OP_TEX;13241325handleTEX(txd);1326while (txd->srcExists(arg))1327++arg;13281329txd->tex.derivAll = true;1330if (txd->op == OP_TEX)1331return handleManualTXD(txd);13321333assert(arg == expected_args);1334for (int c = 0; c < dim; ++c) {1335txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);1336txd->setSrc(arg + c * 2 + 1, 
txd->dPdy[c]);1337txd->dPdx[c].set(NULL);1338txd->dPdy[c].set(NULL);1339}13401341// In this case we have fewer than 4 "real" arguments, which means that1342// handleTEX didn't apply any padding. However we have to make sure that1343// the second "group" of arguments still gets padded up to 4.1344if (chipset >= NVISA_GK104_CHIPSET) {1345int s = arg + 2 * dim;1346if (s >= 4 && s < 7) {1347if (txd->srcExists(s)) // move potential predicate out of the way1348txd->moveSources(s, 7 - s);1349while (s < 7)1350txd->setSrc(s++, bld.loadImm(NULL, 0));1351}1352}13531354return true;1355}13561357bool1358NVC0LoweringPass::handleTXQ(TexInstruction *txq)1359{1360const int chipset = prog->getTarget()->getChipset();1361if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)1362txq->tex.r += prog->driver->io.texBindBase / 4;13631364if (txq->tex.rIndirectSrc < 0)1365return true;13661367Value *ticRel = txq->getIndirectR();13681369txq->setIndirectS(NULL);1370txq->tex.sIndirectSrc = -1;13711372assert(ticRel);13731374if (chipset < NVISA_GK104_CHIPSET) {1375LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa13761377txq->setSrc(txq->tex.rIndirectSrc, NULL);1378if (txq->tex.r)1379ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),1380ticRel, bld.mkImm(txq->tex.r));13811382bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));13831384txq->moveSources(0, 1);1385txq->setSrc(0, src);1386} else {1387Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);1388txq->tex.r = 0xff;1389txq->tex.s = 0x1f;13901391txq->setIndirectR(NULL);1392txq->moveSources(0, 1);1393txq->setSrc(0, hnd);1394txq->tex.rIndirectSrc = 0;1395}13961397return true;1398}13991400bool1401NVC0LoweringPass::handleTXLQ(TexInstruction *i)1402{1403/* The outputs are inverted compared to what the TGSI instruction1404* expects. 
Take that into account in the mask.1405*/1406assert((i->tex.mask & ~3) == 0);1407if (i->tex.mask == 1)1408i->tex.mask = 2;1409else if (i->tex.mask == 2)1410i->tex.mask = 1;1411handleTEX(i);1412bld.setPosition(i, true);14131414/* The returned values are not quite what we want:1415* (a) convert from s16/u16 to f321416* (b) multiply by 1/2561417*/1418for (int def = 0; def < 2; ++def) {1419if (!i->defExists(def))1420continue;1421enum DataType type = TYPE_S16;1422if (i->tex.mask == 2 || def > 0)1423type = TYPE_U16;1424bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));1425bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),1426i->getDef(def), bld.loadImm(NULL, 1.0f / 256));1427}1428if (i->tex.mask == 3) {1429LValue *t = new_LValue(func, FILE_GPR);1430bld.mkMov(t, i->getDef(0));1431bld.mkMov(i->getDef(0), i->getDef(1));1432bld.mkMov(i->getDef(1), t);1433}1434return true;1435}14361437bool1438NVC0LoweringPass::handleBUFQ(Instruction *bufq)1439{1440bufq->op = OP_MOV;1441bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),1442bufq->getSrc(0)->reg.fileIndex * 16));1443bufq->setIndirect(0, 0, NULL);1444bufq->setIndirect(0, 1, NULL);1445return true;1446}14471448void1449NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)1450{1451assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);14521453BasicBlock *currBB = atom->bb;1454BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);1455BasicBlock *joinBB = atom->bb->splitAfter(atom);1456BasicBlock *setAndUnlockBB = new BasicBlock(func);1457BasicBlock *failLockBB = new BasicBlock(func);14581459bld.setPosition(currBB, true);1460assert(!currBB->joinAt);1461currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);14621463CmpInstruction *pred =1464bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),1465TYPE_U32, bld.mkImm(0), bld.mkImm(1));14661467bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);1468currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);14691470bld.setPosition(tryLockBB, 
true);14711472Instruction *ld =1473bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),1474atom->getIndirect(0, 0));1475ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));1476ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;14771478bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));1479bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);1480tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);1481tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);14821483tryLockBB->cfg.detach(&joinBB->cfg);1484bld.remove(atom);14851486bld.setPosition(setAndUnlockBB, true);1487Value *stVal;1488if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {1489// Read the old value, and write the new one.1490stVal = atom->getSrc(1);1491} else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {1492CmpInstruction *set =1493bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),1494TYPE_U32, ld->getDef(0), atom->getSrc(1));14951496bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),1497TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));1498} else {1499operation op;15001501switch (atom->subOp) {1502case NV50_IR_SUBOP_ATOM_ADD:1503op = OP_ADD;1504break;1505case NV50_IR_SUBOP_ATOM_AND:1506op = OP_AND;1507break;1508case NV50_IR_SUBOP_ATOM_OR:1509op = OP_OR;1510break;1511case NV50_IR_SUBOP_ATOM_XOR:1512op = OP_XOR;1513break;1514case NV50_IR_SUBOP_ATOM_MIN:1515op = OP_MIN;1516break;1517case NV50_IR_SUBOP_ATOM_MAX:1518op = OP_MAX;1519break;1520default:1521assert(0);1522return;1523}15241525stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),1526atom->getSrc(1));1527}15281529Instruction *st =1530bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),1531atom->getIndirect(0, 0), stVal);1532st->setDef(0, pred->getDef(0));1533st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;15341535bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);1536setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);15371538// Lock until the store has not been performed.1539bld.setPosition(failLockBB, 
true);1540bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));1541bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);1542failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);1543failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);15441545bld.setPosition(joinBB, false);1546bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;1547}15481549void1550NVC0LoweringPass::handleSharedATOM(Instruction *atom)1551{1552assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);15531554BasicBlock *currBB = atom->bb;1555BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);1556BasicBlock *joinBB = atom->bb->splitAfter(atom);15571558bld.setPosition(currBB, true);1559assert(!currBB->joinAt);1560currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);15611562bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);1563currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);15641565bld.setPosition(tryLockAndSetBB, true);15661567Instruction *ld =1568bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),1569atom->getIndirect(0, 0));1570ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));1571ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;15721573Value *stVal;1574if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {1575// Read the old value, and write the new one.1576stVal = atom->getSrc(1);1577} else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {1578CmpInstruction *set =1579bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),1580TYPE_U32, ld->getDef(0), atom->getSrc(1));1581set->setPredicate(CC_P, ld->getDef(1));15821583Instruction *selp =1584bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),1585atom->getSrc(2), set->getDef(0));1586selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);1587selp->setPredicate(CC_P, ld->getDef(1));15881589stVal = selp->getDef(0);1590} else {1591operation op;15921593switch (atom->subOp) {1594case NV50_IR_SUBOP_ATOM_ADD:1595op = OP_ADD;1596break;1597case NV50_IR_SUBOP_ATOM_AND:1598op = OP_AND;1599break;1600case NV50_IR_SUBOP_ATOM_OR:1601op = 
OP_OR;1602break;1603case NV50_IR_SUBOP_ATOM_XOR:1604op = OP_XOR;1605break;1606case NV50_IR_SUBOP_ATOM_MIN:1607op = OP_MIN;1608break;1609case NV50_IR_SUBOP_ATOM_MAX:1610op = OP_MAX;1611break;1612default:1613assert(0);1614return;1615}16161617Instruction *i =1618bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),1619atom->getSrc(1));1620i->setPredicate(CC_P, ld->getDef(1));16211622stVal = i->getDef(0);1623}16241625Instruction *st =1626bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),1627atom->getIndirect(0, 0), stVal);1628st->setPredicate(CC_P, ld->getDef(1));1629st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;16301631// Loop until the lock is acquired.1632bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));1633tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);1634tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);1635bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);16361637bld.remove(atom);16381639bld.setPosition(joinBB, false);1640bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;1641}16421643bool1644NVC0LoweringPass::handleATOM(Instruction *atom)1645{1646SVSemantic sv;1647Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;16481649switch (atom->src(0).getFile()) {1650case FILE_MEMORY_LOCAL:1651sv = SV_LBASE;1652break;1653case FILE_MEMORY_SHARED:1654// For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic1655// operations on shared memory. 
For Maxwell, ATOMS is enough.1656if (targ->getChipset() < NVISA_GK104_CHIPSET)1657handleSharedATOM(atom);1658else if (targ->getChipset() < NVISA_GM107_CHIPSET)1659handleSharedATOMNVE4(atom);1660return true;1661case FILE_MEMORY_GLOBAL:1662return true;1663default:1664assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);1665base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);1666assert(base->reg.size == 8);1667if (ptr)1668base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);1669assert(base->reg.size == 8);1670atom->setIndirect(0, 0, base);1671atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;16721673// Harden against out-of-bounds accesses1674Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));1675Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);1676Value *pred = new_LValue(func, FILE_PREDICATE);1677if (ptr)1678bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);1679bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);1680atom->setPredicate(CC_NOT_P, pred);1681if (atom->defExists(0)) {1682Value *zero, *dst = atom->getDef(0);1683atom->setDef(0, bld.getSSA());16841685bld.setPosition(atom, true);1686bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))1687->setPredicate(CC_P, pred);1688bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);1689}16901691return true;1692}1693base =1694bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));16951696atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));1697atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;1698if (ptr)1699base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);1700atom->setIndirect(0, 1, NULL);1701atom->setIndirect(0, 0, base);17021703return true;1704}17051706bool1707NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)1708{1709if (targ->getChipset() < NVISA_GM107_CHIPSET) {1710if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {1711// ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().1712return 
false;1713}1714}17151716if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&1717cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)1718return false;1719bld.setPosition(cas, true);17201721if (needCctl) {1722Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));1723cctl->setIndirect(0, 0, cas->getIndirect(0, 0));1724cctl->fixed = 1;1725cctl->subOp = NV50_IR_SUBOP_CCTL_IV;1726if (cas->isPredicated())1727cctl->setPredicate(cas->cc, cas->getPredicate());1728}17291730if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&1731targ->getChipset() < NVISA_GV100_CHIPSET) {1732// CAS is crazy. It's 2nd source is a double reg, and the 3rd source1733// should be set to the high part of the double reg or bad things will1734// happen elsewhere in the universe.1735// Also, it sometimes returns the new value instead of the old one1736// under mysterious circumstances.1737DataType ty = typeOfSize(typeSizeof(cas->dType) * 2);1738Value *dreg = bld.getSSA(typeSizeof(ty));1739bld.setPosition(cas, false);1740bld.mkOp2(OP_MERGE, ty, dreg, cas->getSrc(1), cas->getSrc(2));1741cas->setSrc(1, dreg);1742cas->setSrc(2, dreg);1743}17441745return true;1746}17471748inline Value *1749NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)1750{1751uint8_t b = prog->driver->io.auxCBSlot;1752off += base;17531754return bld.1755mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);1756}17571758inline Value *1759NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)1760{1761uint8_t b = prog->driver->io.auxCBSlot;1762off += base;17631764if (ptr)1765ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));17661767return bld.1768mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);1769}17701771inline Value *1772NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)1773{1774uint8_t b = prog->driver->io.auxCBSlot;1775off += base;17761777if (ptr)1778ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, 
bld.mkImm(4));17791780return bld.1781mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);1782}17831784inline Value *1785NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)1786{1787return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);1788}17891790inline Value *1791NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)1792{1793return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);1794}17951796inline Value *1797NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)1798{1799return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);1800}18011802inline Value *1803NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)1804{1805return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);1806}18071808inline Value *1809NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)1810{1811uint8_t b = prog->driver->io.msInfoCBSlot;1812off += prog->driver->io.msInfoBase;1813return bld.1814mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);1815}18161817inline Value *1818NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)1819{1820uint32_t base = slot * NVC0_SU_INFO__STRIDE;18211822// We don't upload surface info for bindless for GM107+1823assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);18241825if (ptr) {1826ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));1827if (bindless)1828ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));1829else1830ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));1831ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));1832base = 0;1833}1834off += base;18351836return loadResInfo32(ptr, off, bindless ? 
prog->driver->io.bindlessBase :1837prog->driver->io.suInfoBase);1838}18391840Value *1841NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)1842{1843if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)1844return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);18451846assert(bindless);18471848Value *samples = bld.getSSA();1849// this shouldn't be lowered because it's being inserted before the current instruction1850TexInstruction *tex = new_TexInstruction(func, OP_TXQ);1851tex->tex.target = target;1852tex->tex.query = TXQ_TYPE;1853tex->tex.mask = 0x4;1854tex->tex.r = 0xff;1855tex->tex.s = 0x1f;1856tex->tex.rIndirectSrc = 0;1857tex->setDef(0, samples);1858tex->setSrc(0, ind);1859tex->setSrc(1, bld.loadImm(NULL, 0));1860bld.insert(tex);18611862// doesn't work with sample counts other than 1/2/4/8 but they aren't supported1863switch (index) {1864case 0: {1865Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));1866return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));1867}1868case 1: {1869Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);1870return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));1871}1872default: {1873assert(false);1874return NULL;1875}1876}1877}18781879static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)1880{1881switch (su->tex.target.getEnum()) {1882case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);1883case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1884case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1885case TEX_TARGET_1D_ARRAY: return (c == 1) ?1886NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :1887NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1888case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);1889case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);1890case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 
2);1891case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1892case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1893case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1894case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);1895default:1896assert(0);1897return 0;1898}1899}19001901bool1902NVC0LoweringPass::handleSUQ(TexInstruction *suq)1903{1904int mask = suq->tex.mask;1905int dim = suq->tex.target.getDim();1906int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());1907Value *ind = suq->getIndirectR();1908int slot = suq->tex.r;1909int c, d;19101911for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {1912if (c >= arg || !(mask & 1))1913continue;19141915int offset;19161917if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {1918offset = NVC0_SU_INFO_SIZE(2);1919} else {1920offset = NVC0_SU_INFO_SIZE(c);1921}1922bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));1923if (c == 2 && suq->tex.target.isCube())1924bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),1925bld.loadImm(NULL, 6));1926}19271928if (mask & 1) {1929if (suq->tex.target.isMS()) {1930Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);1931Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);1932Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);1933bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);1934} else {1935bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));1936}1937}19381939bld.remove(suq);1940return true;1941}19421943void1944NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)1945{1946const int arg = tex->tex.target.getArgCount();1947int slot = tex->tex.r;19481949if (tex->tex.target == TEX_TARGET_2D_MS)1950tex->tex.target = TEX_TARGET_2D;1951else1952if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)1953tex->tex.target = TEX_TARGET_2D_ARRAY;1954else1955return;19561957Value *x = tex->getSrc(0);1958Value *y = 
tex->getSrc(1);1959Value *s = tex->getSrc(arg - 1);19601961Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();1962Value *ind = tex->getIndirectR();19631964Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);1965Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);19661967bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);1968bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);19691970s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));1971s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));19721973Value *dx = loadMsInfo32(ts, 0x0);1974Value *dy = loadMsInfo32(ts, 0x4);19751976bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);1977bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);19781979tex->setSrc(0, tx);1980tex->setSrc(1, ty);1981tex->moveSources(arg, -1);1982}19831984// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.1985// They're computed from the coordinates using the surface info in c[] space.1986void1987NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)1988{1989Instruction *insn;1990const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;1991const bool raw =1992su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;1993const int slot = su->tex.r;1994const int dim = su->tex.target.getDim();1995const bool array = su->tex.target.isArray() || su->tex.target.isCube();1996const int arg = dim + array;1997int c;1998Value *zero = bld.mkImm(0);1999Value *p1 = NULL;2000Value *v;2001Value *src[3];2002Value *bf, *eau, *off;2003Value *addr, *pred;2004Value *ind = su->getIndirectR();2005Value *y, *z;20062007off = bld.getScratch(4);2008bf = bld.getScratch(4);2009addr = bld.getSSA(8);2010pred = bld.getScratch(1, FILE_PREDICATE);20112012bld.setPosition(su, false);20132014adjustCoordinatesMS(su);20152016// calculate clamped coordinates2017for (c = 0; c < arg; ++c) {2018int dimc = c;20192020if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {2021// The array index is stored in the Z 
component for 1D arrays.2022dimc = 2;2023}20242025src[c] = bld.getScratch();2026if (c == 0 && raw)2027v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);2028else2029v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);2030bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)2031->subOp = getSuClampSubOp(su, dimc);2032}2033for (; c < 3; ++c)2034src[c] = zero;20352036if (dim == 2 && !array) {2037v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2038src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2039v, bld.loadImm(NULL, 16));20402041v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);2042bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)2043->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);2044}20452046// set predicate output2047if (su->tex.target == TEX_TARGET_BUFFER) {2048src[0]->getInsn()->setFlagsDef(1, pred);2049} else2050if (array) {2051p1 = bld.getSSA(1, FILE_PREDICATE);2052src[dim]->getInsn()->setFlagsDef(1, p1);2053}20542055// calculate pixel offset2056if (dim == 1) {2057y = z = zero;2058if (su->tex.target != TEX_TARGET_BUFFER)2059bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));2060} else {2061y = src[1];2062z = src[2];20632064v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2065bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])2066->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l20672068v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);2069bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])2070->subOp = array ?2071NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l2072}20732074// calculate effective address part 12075if (su->tex.target == TEX_TARGET_BUFFER) {2076if (raw) {2077bf = src[0];2078} else {2079v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);2080bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)2081->subOp = NV50_IR_SUBOP_V1(7,6,8|2);2082}2083} else {2084uint16_t subOp = 
0;20852086switch (dim) {2087case 1:2088break;2089case 2:2090if (array) {2091z = off;2092} else {2093subOp = NV50_IR_SUBOP_SUBFM_3D;2094}2095break;2096default:2097subOp = NV50_IR_SUBOP_SUBFM_3D;2098assert(dim == 3);2099break;2100}2101insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);2102insn->subOp = subOp;2103insn->setFlagsDef(1, pred);2104}21052106// part 22107v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);21082109if (su->tex.target == TEX_TARGET_BUFFER) {2110eau = v;2111} else {2112eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);2113}2114// add array layer offset2115if (array) {2116v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);2117if (dim == 1)2118bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)2119->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u322120else2121bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)2122->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u322123// combine predicates2124assert(p1);2125bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);2126}21272128if (atom) {2129Value *lo = bf;2130if (su->tex.target == TEX_TARGET_BUFFER) {2131lo = zero;2132bld.mkMov(off, bf);2133}2134// bf == g[] address & 0xff2135// eau == g[] address >> 82136bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);2137bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);2138} else2139if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {2140// Convert from u32 to u8 address format, which is what the library code2141// doing SULDP currently uses.2142// XXX: can SUEAU do this ?2143// XXX: does it matter that we don't mask high bytes in bf ?2144// Grrr.2145bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));2146bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);2147}21482149bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);21502151if (atom && su->tex.target == TEX_TARGET_BUFFER)2152bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);21532154// let's just set it 0 for raw access and hope it 
works2155v = raw ?2156bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);21572158// get rid of old coordinate sources, make space for fmt info and predicate2159su->moveSources(arg, 3 - arg);2160// set 64 bit address and 32-bit format sources2161su->setSrc(0, addr);2162su->setSrc(1, v);2163su->setSrc(2, pred);2164su->setIndirectR(NULL);21652166// prevent read fault when the image is not actually bound2167CmpInstruction *pred1 =2168bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2169TYPE_U32, bld.mkImm(0),2170loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));21712172if (su->op != OP_SUSTP && su->tex.format) {2173const TexInstruction::ImgFormatDesc *format = su->tex.format;2174int blockwidth = format->bits[0] + format->bits[1] +2175format->bits[2] + format->bits[3];21762177// make sure that the format doesn't mismatch2178assert(format->components != 0);2179bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),2180TYPE_U32, bld.loadImm(NULL, blockwidth / 8),2181loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2182pred1->getDef(0));2183}2184su->setPredicate(CC_NOT_P, pred1->getDef(0));21852186// TODO: initialize def values to 0 when the surface operation is not2187// performed (not needed for stores). Also, fix the "address bounds test"2188// subtests from arb_shader_image_load_store-invalid for buffers, because it2189// seems like that the predicate is not correctly set by suclamp.2190}21912192static DataType2193getSrcType(const TexInstruction::ImgFormatDesc *t, int c)2194{2195switch (t->type) {2196case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;2197case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;2198case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;2199case UINT:2200return (t->bits[c] == 8 ? TYPE_U8 :2201(t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));2202case SINT:2203return (t->bits[c] == 8 ? TYPE_S8 :2204(t->bits[c] == 16 ? 
TYPE_S16 : TYPE_S32));2205}2206return TYPE_NONE;2207}22082209static DataType2210getDestType(const ImgType type) {2211switch (type) {2212case FLOAT:2213case UNORM:2214case SNORM:2215return TYPE_F32;2216case UINT:2217return TYPE_U32;2218case SINT:2219return TYPE_S32;2220default:2221assert(!"Impossible type");2222return TYPE_NONE;2223}2224}22252226void2227NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)2228{2229const TexInstruction::ImgFormatDesc *format = su->tex.format;2230int width = format->bits[0] + format->bits[1] +2231format->bits[2] + format->bits[3];2232Value *untypedDst[4] = {};2233Value *typedDst[4] = {};22342235// We must convert this to a generic load.2236su->op = OP_SULDB;22372238su->dType = typeOfSize(width / 8);2239su->sType = TYPE_U8;22402241for (int i = 0; i < width / 32; i++)2242untypedDst[i] = bld.getSSA();2243if (width < 32)2244untypedDst[0] = bld.getSSA();22452246if (loaded && loaded[0]) {2247for (int i = 0; i < 4; i++) {2248if (loaded[i])2249typedDst[i] = loaded[i]->getDef(0);2250}2251} else {2252for (int i = 0; i < 4; i++) {2253typedDst[i] = su->getDef(i);2254}2255}22562257// Set the untyped dsts as the su's destinations2258if (loaded && loaded[0]) {2259for (int i = 0; i < 4; i++)2260if (loaded[i])2261loaded[i]->setDef(0, untypedDst[i]);2262} else {2263for (int i = 0; i < 4; i++)2264su->setDef(i, untypedDst[i]);22652266bld.setPosition(su, true);2267}22682269// Unpack each component into the typed dsts2270int bits = 0;2271for (int i = 0; i < 4; bits += format->bits[i], i++) {2272if (!typedDst[i])2273continue;22742275if (loaded && loaded[0])2276bld.setPosition(loaded[i], true);22772278if (i >= format->components) {2279if (format->type == FLOAT ||2280format->type == UNORM ||2281format->type == SNORM)2282bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);2283else2284bld.loadImm(typedDst[i], i == 3 ? 
1 : 0);2285continue;2286}22872288// Get just that component's data into the relevant place2289if (format->bits[i] == 32)2290bld.mkMov(typedDst[i], untypedDst[i]);2291else if (format->bits[i] == 16)2292bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],2293getSrcType(format, i), untypedDst[i / 2])2294->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);2295else if (format->bits[i] == 8)2296bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],2297getSrcType(format, i), untypedDst[0])->subOp = i;2298else {2299bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],2300bld.mkImm((bits % 32) | (format->bits[i] << 8)));2301if (format->type == UNORM || format->type == SNORM)2302bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);2303}23042305// Normalize / convert as necessary2306if (format->type == UNORM)2307bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));2308else if (format->type == SNORM)2309bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));2310else if (format->type == FLOAT && format->bits[i] < 16) {2311bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));2312bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);2313}2314}23152316if (format->bgra) {2317std::swap(typedDst[0], typedDst[2]);2318}2319}23202321void2322NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)2323{2324if (!su->getPredicate())2325return;23262327bld.setPosition(su, true);23282329for (unsigned i = 0; su->defExists(i); ++i) {2330Value *def = su->getDef(i);2331Value *newDef = bld.getSSA();2332su->setDef(i, newDef);23332334Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2335assert(su->cc == CC_NOT_P);2336mov->setPredicate(CC_P, su->getPredicate());2337Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), newDef, mov->getDef(0));2338bld.mkMov(def, 
uni->getDef(0));2339}2340}23412342void2343NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)2344{2345processSurfaceCoordsNVE4(su);23462347if (su->op == OP_SULDP) {2348convertSurfaceFormat(su, NULL);2349insertOOBSurfaceOpResult(su);2350}23512352if (su->op == OP_SUREDB || su->op == OP_SUREDP) {2353assert(su->getPredicate());2354Value *pred =2355bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),2356su->getPredicate(), su->getSrc(2));23572358Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());2359red->subOp = su->subOp;2360red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));2361red->setSrc(1, su->getSrc(3));2362if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)2363red->setSrc(2, su->getSrc(4));2364red->setIndirect(0, 0, su->getSrc(0));23652366// make sure to initialize dst value when the atomic operation is not2367// performed2368Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));23692370assert(su->cc == CC_NOT_P);2371red->setPredicate(su->cc, pred);2372mov->setPredicate(CC_P, pred);23732374bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),2375red->getDef(0), mov->getDef(0));23762377delete_Instruction(bld.getProgram(), su);2378handleCasExch(red, true);2379}23802381if (su->op == OP_SUSTB || su->op == OP_SUSTP)2382su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? 
TYPE_U32 : TYPE_U8;2383}23842385void2386NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)2387{2388const int slot = su->tex.r;2389const int dim = su->tex.target.getDim();2390const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());2391int c;2392Value *zero = bld.mkImm(0);2393Value *src[3];2394Value *v;2395Value *ind = su->getIndirectR();23962397bld.setPosition(su, false);23982399adjustCoordinatesMS(su);24002401if (ind) {2402Value *ptr;2403ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));2404ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));2405su->setIndirectR(ptr);2406}24072408// get surface coordinates2409for (c = 0; c < arg; ++c)2410src[c] = su->getSrc(c);2411for (; c < 3; ++c)2412src[c] = zero;24132414// calculate pixel offset2415if (su->op == OP_SULDP || su->op == OP_SUREDP) {2416v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);2417su->setSrc(0, (src[0] = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), src[0], v)));2418}24192420// add array layer offset2421if (su->tex.target.isArray() || su->tex.target.isCube()) {2422v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);2423assert(dim > 1);2424su->setSrc(2, (src[2] = bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v)));2425}24262427// 3d is special-cased. Note that a single "slice" of a 3d image may2428// also be attached as 2d, so we have to do the same 3d processing for2429// 2d as well, just in case. 
In order to remap a 3d image onto a 2d2430// image, we have to retile it "by hand".2431if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {2432Value *z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2433Value *y_size_aligned =2434bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),2435loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM_Y, su->tex.bindless),2436bld.loadImm(NULL, 0x0000ffff));2437// Add the z coordinate for actual 3d-images2438if (dim > 2)2439src[2] = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), z, src[2]);2440else2441src[2] = z;24422443// Compute the surface parameters from tile shifts2444Value *tile_shift[3];2445Value *tile_extbf[3];2446// Fetch the "real" tiling parameters of the underlying surface2447for (int i = 0; i < 3; i++) {2448tile_extbf[i] =2449bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2450loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),2451bld.loadImm(NULL, 16));2452tile_shift[i] =2453bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),2454loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(i), su->tex.bindless),2455bld.loadImm(NULL, 24));2456}24572458// However for load/atomics, we use byte-indexing. And for byte2459// indexing, the X tile size is always the same. This leads to slightly2460// better code.2461if (su->op == OP_SULDP || su->op == OP_SUREDP) {2462tile_extbf[0] = bld.loadImm(NULL, 0x600);2463tile_shift[0] = bld.loadImm(NULL, 6);2464}24652466// Compute the location of given coordinate, both inside the tile as2467// well as which (linearly-laid out) tile it's in.2468Value *coord_in_tile[3];2469Value *tile[3];2470for (int i = 0; i < 3; i++) {2471coord_in_tile[i] = bld.mkOp2v(OP_EXTBF, TYPE_U32, bld.getSSA(), src[i], tile_extbf[i]);2472tile[i] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), src[i], tile_shift[i]);2473}24742475// Based on the "real" tiling parameters, compute x/y coordinates in the2476// larger surface with 2d tiling that was supplied to the hardware. 
This2477// was determined and verified with the help of the tiling pseudocode in2478// the envytools docs.2479//2480// adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +2481// z_coord_in_tile * x_tile_size2482// adj_y = y_coord_in_tile + y_tile * y_tile_size +2483// z_tile * y_tile_size * y_tiles2484//2485// Note: STRIDE_Y = y_tile_size * y_tiles24862487su->setSrc(0, bld.mkOp2v(2488OP_ADD, TYPE_U32, bld.getSSA(),2489bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2490coord_in_tile[0],2491bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2492tile[0],2493bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2494tile_shift[2], tile_shift[0]))),2495bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2496coord_in_tile[2], tile_shift[0])));24972498su->setSrc(1, bld.mkOp2v(2499OP_ADD, TYPE_U32, bld.getSSA(),2500bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(),2501tile[2], y_size_aligned),2502bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2503coord_in_tile[1],2504bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2505tile[1], tile_shift[1]))));25062507if (su->tex.target == TEX_TARGET_3D) {2508su->moveSources(3, -1);2509su->tex.target = TEX_TARGET_2D;2510}2511}25122513// prevent read fault when the image is not actually bound2514CmpInstruction *pred =2515bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2516TYPE_U32, bld.mkImm(0),2517loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));2518if (su->op != OP_SUSTP && su->tex.format) {2519const TexInstruction::ImgFormatDesc *format = su->tex.format;2520int blockwidth = format->bits[0] + format->bits[1] +2521format->bits[2] + format->bits[3];25222523assert(format->components != 0);2524// make sure that the format doesn't mismatch when it's not FMT_NONE2525bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),2526TYPE_U32, bld.loadImm(NULL, ffs(blockwidth / 8) - 1),2527loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2528pred->getDef(0));2529}2530su->setPredicate(CC_NOT_P, 
pred->getDef(0));2531}25322533void2534NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)2535{2536if (su->tex.target == TEX_TARGET_1D_ARRAY) {2537/* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY2538* will simplify the lowering pass and the texture constraints. */2539su->moveSources(1, 1);2540su->setSrc(1, bld.loadImm(NULL, 0));2541su->tex.target = TEX_TARGET_2D_ARRAY;2542}25432544processSurfaceCoordsNVC0(su);25452546if (su->op == OP_SULDP) {2547convertSurfaceFormat(su, NULL);2548insertOOBSurfaceOpResult(su);2549}25502551if (su->op == OP_SUREDB || su->op == OP_SUREDP) {2552const int dim = su->tex.target.getDim();2553const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());2554LValue *addr = bld.getSSA(8);2555Value *def = su->getDef(0);25562557su->op = OP_SULEA;25582559// Set the destination to the address2560su->dType = TYPE_U64;2561su->setDef(0, addr);2562su->setDef(1, su->getPredicate());25632564bld.setPosition(su, true);25652566// Perform the atomic op2567Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());2568red->subOp = su->subOp;2569red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));2570red->setSrc(1, su->getSrc(arg));2571if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)2572red->setSrc(2, su->getSrc(arg + 1));2573red->setIndirect(0, 0, addr);25742575// make sure to initialize dst value when the atomic operation is not2576// performed2577Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));25782579assert(su->cc == CC_NOT_P);2580red->setPredicate(su->cc, su->getPredicate());2581mov->setPredicate(CC_P, su->getPredicate());25822583bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));25842585handleCasExch(red, false);2586}2587}25882589TexInstruction *2590NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])2591{2592const int slot = su->tex.r;2593const int dim = su->tex.target.getDim();2594const bool array = su->tex.target.isArray() || 
su->tex.target.isCube();2595const int arg = dim + array;2596Value *ind = su->getIndirectR();2597Value *handle;2598Instruction *pred = NULL, *pred2d = NULL;2599int pos = 0;26002601bld.setPosition(su, false);26022603adjustCoordinatesMS(su);26042605// add texture handle2606switch (su->op) {2607case OP_SUSTP:2608pos = 4;2609break;2610case OP_SUREDP:2611pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;2612break;2613default:2614assert(pos == 0);2615break;2616}26172618if (dim == 2 && !array) {2619// This might be a 2d slice of a 3d texture, try to load the z2620// coordinate in.2621Value *v;2622if (!su->tex.bindless)2623v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);2624else2625v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));2626Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));2627pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2628TYPE_U32, bld.mkImm(0), is_3d);26292630bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));2631su->moveSources(dim, 1);2632su->setSrc(dim, v);2633su->tex.target = nv50_ir::TEX_TARGET_3D;2634pos++;2635}26362637if (su->tex.bindless)2638handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));2639else2640handle = loadTexHandle(ind, slot + 32);26412642su->setSrc(arg + pos, handle);26432644// The address check doesn't make sense here. 
The format check could make2645// sense but it's a bit of a pain.2646if (!su->tex.bindless) {2647// prevent read fault when the image is not actually bound2648pred =2649bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),2650TYPE_U32, bld.mkImm(0),2651loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));2652if (su->op != OP_SUSTP && su->tex.format) {2653const TexInstruction::ImgFormatDesc *format = su->tex.format;2654int blockwidth = format->bits[0] + format->bits[1] +2655format->bits[2] + format->bits[3];26562657assert(format->components != 0);2658// make sure that the format doesn't mismatch when it's not FMT_NONE2659bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),2660TYPE_U32, bld.loadImm(NULL, blockwidth / 8),2661loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),2662pred->getDef(0));2663}2664}26652666// Now we have "pred" which (optionally) contains whether to do the surface2667// op at all, and a "pred2d" which indicates that, in case of doing the2668// surface op, we have to create a 2d and 3d version, conditioned on pred2d.2669TexInstruction *su2d = NULL;2670if (pred2d) {2671su2d = cloneForward(func, su)->asTex();2672for (unsigned i = 0; su->defExists(i); ++i)2673su2d->setDef(i, bld.getSSA());2674su2d->moveSources(dim + 1, -1);2675su2d->tex.target = nv50_ir::TEX_TARGET_2D;2676}2677if (pred2d && pred) {2678Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,2679bld.getSSA(1, FILE_PREDICATE),2680pred->getDef(0), pred2d->getDef(0));2681pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);2682pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);2683su->setPredicate(CC_P, pred3d->getDef(0));2684pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),2685pred->getDef(0), pred2d->getDef(0));2686pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);2687} else if (pred) {2688su->setPredicate(CC_NOT_P, pred->getDef(0));2689} else if (pred2d) {2690su->setPredicate(CC_NOT_P, pred2d->getDef(0));2691}2692if (su2d) {2693su2d->setPredicate(CC_P, 
pred2d->getDef(0));2694bld.insert(su2d);26952696// Create a UNION so that RA assigns the same registers2697bld.setPosition(su, true);2698for (unsigned i = 0; su->defExists(i); ++i) {2699assert(i < 4);27002701Value *def = su->getDef(i);2702Value *newDef = bld.getSSA();2703ValueDef &def2 = su2d->def(i);2704Instruction *mov = NULL;27052706su->setDef(i, newDef);2707if (pred) {2708mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2709mov->setPredicate(CC_P, pred->getDef(0));2710}27112712Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,2713bld.getSSA(),2714newDef, def2.get());2715if (mov)2716uni->setSrc(2, mov->getDef(0));2717bld.mkMov(def, uni->getDef(0));2718}2719} else if (pred) {2720// Create a UNION so that RA assigns the same registers2721bld.setPosition(su, true);2722for (unsigned i = 0; su->defExists(i); ++i) {2723assert(i < 4);27242725Value *def = su->getDef(i);2726Value *newDef = bld.getSSA();2727su->setDef(i, newDef);27282729Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));2730mov->setPredicate(CC_P, pred->getDef(0));27312732Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,2733bld.getSSA(),2734newDef, mov->getDef(0));2735bld.mkMov(def, uni->getDef(0));2736}2737}27382739return su2d;2740}27412742void2743NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)2744{2745// processSurfaceCoords also takes care of fixing up the outputs and2746// union'ing them with 0 as necessary. 
Additionally it may create a second2747// surface which needs some of the similar fixups.27482749Instruction *loaded[4] = {};2750TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);27512752if (su->op == OP_SULDP) {2753convertSurfaceFormat(su, loaded);2754}27552756if (su->op == OP_SUREDP) {2757su->op = OP_SUREDB;2758}27592760// If we fixed up the type of the regular surface load instruction, we also2761// have to fix up the copy.2762if (su2) {2763su2->op = su->op;2764su2->dType = su->dType;2765su2->sType = su->sType;2766}2767}27682769bool2770NVC0LoweringPass::handleWRSV(Instruction *i)2771{2772Instruction *st;2773Symbol *sym;2774uint32_t addr;27752776// must replace, $sreg are not writeable2777addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());2778if (addr >= 0x400)2779return false;2780sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);27812782st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),2783i->getSrc(1));2784st->perPatch = i->perPatch;27852786bld.getBB()->remove(i);2787return true;2788}27892790void2791NVC0LoweringPass::handleLDST(Instruction *i)2792{2793if (i->src(0).getFile() == FILE_SHADER_INPUT) {2794if (prog->getType() == Program::TYPE_COMPUTE) {2795i->getSrc(0)->reg.file = FILE_MEMORY_CONST;2796i->getSrc(0)->reg.fileIndex = 0;2797} else2798if (prog->getType() == Program::TYPE_GEOMETRY &&2799i->src(0).isIndirect(0)) {2800// XXX: this assumes vec4 units2801Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2802i->getIndirect(0, 0), bld.mkImm(4));2803i->setIndirect(0, 0, ptr);2804i->op = OP_VFETCH;2805} else {2806i->op = OP_VFETCH;2807assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP2808}2809} else if (i->src(0).getFile() == FILE_MEMORY_CONST) {2810int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;2811Value *ind = i->getIndirect(0, 1);28122813if (targ->getChipset() >= NVISA_GK104_CHIPSET &&2814prog->getType() == Program::TYPE_COMPUTE &&2815(fileIndex >= 6 || ind)) {2816// The launch descriptor 
only allows to set up 8 CBs, but OpenGL2817// requires at least 12 UBOs. To bypass this limitation, for constant2818// buffers 7+, we store the addrs into the driver constbuf and we2819// directly load from the global memory.2820if (ind) {2821// Clamp the UBO index when an indirect access is used to avoid2822// loading information from the wrong place in the driver cb.2823// TODO - synchronize the max with the driver.2824ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),2825bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),2826ind, bld.loadImm(NULL, fileIndex)),2827bld.loadImm(NULL, 13));2828fileIndex = 0;2829}28302831Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));2832Value *ptr = loadUboInfo64(ind, fileIndex * 16);2833Value *length = loadUboLength32(ind, fileIndex * 16);2834Value *pred = new_LValue(func, FILE_PREDICATE);2835if (i->src(0).isIndirect(0)) {2836bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));2837bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));2838}2839i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;2840i->setIndirect(0, 1, NULL);2841i->setIndirect(0, 0, ptr);2842bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);2843i->setPredicate(CC_NOT_P, pred);2844Value *zero, *dst = i->getDef(0);2845i->setDef(0, bld.getSSA());28462847bld.setPosition(i, true);2848bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))2849->setPredicate(CC_P, pred);2850bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);2851} else if (i->src(0).isIndirect(1)) {2852Value *ptr;2853if (i->src(0).isIndirect(0))2854ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),2855i->getIndirect(0, 1), bld.mkImm(0x1010),2856i->getIndirect(0, 0));2857else2858ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),2859i->getIndirect(0, 1), bld.mkImm(16));2860i->setIndirect(0, 1, NULL);2861i->setIndirect(0, 0, ptr);2862i->subOp = NV50_IR_SUBOP_LDC_IS;2863}2864} else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {2865assert(prog->getType() == 
Program::TYPE_TESSELLATION_CONTROL);2866i->op = OP_VFETCH;2867} else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {2868Value *ind = i->getIndirect(0, 1);2869Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);2870// XXX come up with a way not to do this for EVERY little access but2871// rather to batch these up somehow. Unfortunately we've lost the2872// information about the field width by the time we get here.2873Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));2874Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);2875Value *pred = new_LValue(func, FILE_PREDICATE);2876if (i->src(0).isIndirect(0)) {2877bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));2878bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));2879}2880i->setIndirect(0, 1, NULL);2881i->setIndirect(0, 0, ptr);2882i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;2883bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);2884i->setPredicate(CC_NOT_P, pred);2885if (i->defExists(0)) {2886Value *zero, *dst = i->getDef(0);2887i->setDef(0, bld.getSSA());28882889bld.setPosition(i, true);2890bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))2891->setPredicate(CC_P, pred);2892bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);2893}2894}2895}28962897void2898NVC0LoweringPass::readTessCoord(LValue *dst, int c)2899{2900Value *laneid = bld.getSSA();2901Value *x, *y;29022903bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));29042905if (c == 0) {2906x = dst;2907y = NULL;2908} else2909if (c == 1) {2910x = NULL;2911y = dst;2912} else {2913assert(c == 2);2914if (prog->driver_out->prop.tp.domain != PIPE_PRIM_TRIANGLES) {2915bld.mkMov(dst, bld.loadImm(NULL, 0));2916return;2917}2918x = bld.getSSA();2919y = bld.getSSA();2920}2921if (x)2922bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);2923if (y)2924bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);29252926if (c == 2) 
{2927bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);2928bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);2929}2930}29312932bool2933NVC0LoweringPass::handleRDSV(Instruction *i)2934{2935Symbol *sym = i->getSrc(0)->asSym();2936const SVSemantic sv = sym->reg.data.sv.sv;2937Value *vtx = NULL;2938Instruction *ld;2939uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);29402941if (addr >= 0x400) {2942// mov $sreg2943if (sym->reg.data.sv.index == 3) {2944// TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID2945i->op = OP_MOV;2946i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));2947} else2948if (sv == SV_TID) {2949// Help CSE combine TID fetches2950Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),2951bld.mkSysVal(SV_COMBINED_TID, 0));2952i->op = OP_EXTBF;2953i->setSrc(0, tid);2954switch (sym->reg.data.sv.index) {2955case 0: i->setSrc(1, bld.mkImm(0x1000)); break;2956case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;2957case 2: i->setSrc(1, bld.mkImm(0x061a)); break;2958}2959}2960if (sv == SV_VERTEX_COUNT) {2961bld.setPosition(i, true);2962bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));2963}2964return true;2965}29662967switch (sv) {2968case SV_POSITION:2969assert(prog->getType() == Program::TYPE_FRAGMENT);2970if (i->srcExists(1)) {2971// Pass offset through to the interpolation logic2972ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,2973i->getDef(0), addr, NULL);2974ld->setSrc(1, i->getSrc(1));2975} else {2976bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);2977}2978break;2979case SV_FACE:2980{2981Value *face = i->getDef(0);2982bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);2983if (i->dType == TYPE_F32) {2984bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));2985bld.mkOp1(OP_NEG, TYPE_S32, face, face);2986bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);2987}2988}2989break;2990case SV_TESS_COORD:2991assert(prog->getType() == 
Program::TYPE_TESSELLATION_EVAL);2992readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);2993break;2994case SV_NTID:2995case SV_NCTAID:2996case SV_GRIDID:2997assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise2998if (sym->reg.data.sv.index == 3) {2999i->op = OP_MOV;3000i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));3001return true;3002}3003FALLTHROUGH;3004case SV_WORK_DIM:3005addr += prog->driver->prop.cp.gridInfoBase;3006bld.mkLoad(TYPE_U32, i->getDef(0),3007bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3008TYPE_U32, addr), NULL);3009break;3010case SV_SAMPLE_INDEX:3011// TODO: Properly pass source as an address in the PIX address space3012// (which can be of the form [r0+offset]). But this is currently3013// unnecessary.3014ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));3015ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3016break;3017case SV_SAMPLE_POS: {3018Value *sampleID = bld.getScratch();3019ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));3020ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3021Value *offset = calculateSampleOffset(sampleID);30223023assert(prog->driver_out->prop.fp.readsSampleLocations);30243025if (targ->getChipset() >= NVISA_GM200_CHIPSET) {3026bld.mkLoad(TYPE_F32,3027i->getDef(0),3028bld.mkSymbol(3029FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3030TYPE_U32, prog->driver->io.sampleInfoBase),3031offset);3032bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),3033bld.mkImm(0x040c + sym->reg.data.sv.index * 16));3034bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));3035bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));3036} else {3037bld.mkLoad(TYPE_F32,3038i->getDef(0),3039bld.mkSymbol(3040FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,3041TYPE_U32, prog->driver->io.sampleInfoBase +30424 * sym->reg.data.sv.index),3043offset);3044}3045break;3046}3047case SV_SAMPLE_MASK: {3048ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), 
bld.mkImm(0));3049ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;3050Instruction *sampleid =3051bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));3052sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;3053Value *masked =3054bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),3055bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),3056bld.loadImm(NULL, 1), sampleid->getDef(0)));3057if (prog->persampleInvocation) {3058bld.mkMov(i->getDef(0), masked);3059} else {3060bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,3061bld.mkImm(0))3062->subOp = 1;3063}3064break;3065}3066case SV_BASEVERTEX:3067case SV_BASEINSTANCE:3068case SV_DRAWID:3069ld = bld.mkLoad(TYPE_U32, i->getDef(0),3070bld.mkSymbol(FILE_MEMORY_CONST,3071prog->driver->io.auxCBSlot,3072TYPE_U32,3073prog->driver->io.drawInfoBase +30744 * (sv - SV_BASEVERTEX)),3075NULL);3076break;3077default:3078if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)3079vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));3080if (prog->getType() == Program::TYPE_FRAGMENT) {3081bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);3082} else {3083ld = bld.mkFetch(i->getDef(0), i->dType,3084FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);3085ld->perPatch = i->perPatch;3086}3087break;3088}3089bld.getBB()->remove(i);3090return true;3091}30923093bool3094NVC0LoweringPass::handleDIV(Instruction *i)3095{3096if (!isFloatType(i->dType))3097return true;3098bld.setPosition(i, false);3099Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));3100i->op = OP_MUL;3101i->setSrc(1, rcp->getDef(0));3102return true;3103}31043105bool3106NVC0LoweringPass::handleMOD(Instruction *i)3107{3108if (!isFloatType(i->dType))3109return true;3110LValue *value = bld.getScratch(typeSizeof(i->dType));3111bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));3112bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);3113bld.mkOp1(OP_TRUNC, i->dType, value, value);3114bld.mkOp2(OP_MUL, 
i->dType, value, i->getSrc(1), value);3115i->op = OP_SUB;3116i->setSrc(1, value);3117return true;3118}31193120bool3121NVC0LoweringPass::handleSQRT(Instruction *i)3122{3123if (targ->isOpSupported(OP_SQRT, i->dType))3124return true;31253126if (i->dType == TYPE_F64) {3127Value *pred = bld.getSSA(1, FILE_PREDICATE);3128Value *zero = bld.loadImm(NULL, 0.0);3129Value *dst = bld.getSSA(8);3130bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));3131bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);3132bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);3133i->op = OP_MUL;3134i->setSrc(1, dst);3135// TODO: Handle this properly with a library function3136} else {3137bld.setPosition(i, true);3138i->op = OP_RSQ;3139bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));3140}31413142return true;3143}31443145bool3146NVC0LoweringPass::handlePOW(Instruction *i)3147{3148LValue *val = bld.getScratch();31493150bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));3151bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;3152bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);31533154i->op = OP_EX2;3155i->setSrc(0, val);3156i->setSrc(1, NULL);31573158return true;3159}31603161bool3162NVC0LoweringPass::handleEXPORT(Instruction *i)3163{3164if (prog->getType() == Program::TYPE_FRAGMENT) {3165int id = i->getSrc(0)->reg.data.offset / 4;31663167if (i->src(0).isIndirect(0)) // TODO, ugly3168return false;3169i->op = OP_MOV;3170i->subOp = NV50_IR_SUBOP_MOV_FINAL;3171i->src(0).set(i->src(1));3172i->setSrc(1, NULL);3173i->setDef(0, new_LValue(func, FILE_GPR));3174i->getDef(0)->reg.data.id = id;31753176prog->maxGPR = MAX2(prog->maxGPR, id);3177} else3178if (prog->getType() == Program::TYPE_GEOMETRY) {3179i->setIndirect(0, 1, gpEmitAddress);3180}3181return true;3182}31833184bool3185NVC0LoweringPass::handleOUT(Instruction *i)3186{3187Instruction *prev = i->prev;3188ImmediateValue stream, prevStream;31893190// Only merge if the stream ids match. 
// Also, note that the previous
   // instruction would have already been lowered, so we take arg1 from it.
   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
       i->src(0).getImmediate(stream) &&
       prev->src(1).getImmediate(prevStream) &&
       stream.reg.data.u32 == prevStream.reg.data.u32) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      // Rewrite: def/src0 = emit address, src1 = stream id.
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Compute the byte offset of the given sample's entry in the driver's
// sample-location table (read from the aux constant buffer by
// handlePIXLD).
Value *
NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
{
   Value *offset = bld.getScratch();
   if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
      // Sample location offsets (in bytes) are calculated like so:
      // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
      // offset = offset * 32 + sampleID % 8 * 4;
      // which is equivalent to:
      // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5;
      // offset += sampleID << 2

      // The second operand (src1) of the INSBF instructions are like so:
      // 0xssll where ss is the size and ll is the offset.
      // so: dest = src2 | (src0 & (1 << ss - 1)) << ll

      // Add sample ID (offset = (sampleID & 0x7) << 2)
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));

      Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
      Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
      Value *coord = bld.getScratch();

      // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
                   targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
      // Truncate the interpolated position to an integer pixel coord.
      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
         ->rnd = ROUND_ZI;
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);

      // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
                   targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
      bld.mkCvt(OP_CVT, TYPE_U32,
coord, TYPE_F32, coord)
         ->rnd = ROUND_ZI;
      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
   } else {
      // Pre-GM200: table is simply indexed by sample id, 8 bytes apart.
      bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
   }
   return offset;
}

// Handle programmable sample locations for GM20x+
void
NVC0LoweringPass::handlePIXLD(Instruction *i)
{
   // Only the OFFSET subop needs lowering, and only on GM200+ where
   // sample locations are programmable.
   if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
      return;
   if (targ->getChipset() < NVISA_GM200_CHIPSET)
      return;

   assert(prog->driver_out->prop.fp.readsSampleLocations);

   // Replace PIXLD with a load of the sample position from the table
   // the driver uploads into the aux constant buffer.
   bld.mkLoad(TYPE_F32,
              i->getDef(0),
              bld.mkSymbol(
                    FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                    TYPE_U32, prog->driver->io.sampleInfoBase),
              calculateSampleOffset(i->getSrc(0)));

   bld.getBB()->remove(i);
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   // Nothing to do if there is no predicate or it is already a real one.
   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   // pdst = (pred != 0), compared in the instruction's own type.
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   // Dispatch to the per-opcode lowering helpers.
   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      // EX2 needs its source preprocessed by PREEX2.
      bld.mkOp1(OP_PREEX2, TYPE_F32,
i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      // Note whether the atomic targets a buffer BEFORE handleATOM runs,
      // since lowering may rewrite the source file/symbol -- the flag is
      // then consumed by handleCasExch.
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      // Surface ops have three generations of lowering paths.
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   case OP_PIXLD:
      handlePIXLD(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
    */
   bool doAfetch = false;
   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
       !i->perPatch &&
       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }
   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }

   if (doAfetch) {
      // Split the access: AFETCH computes the indirect base, and the
      // original op keeps only the (now zero-offset) symbol plus the
      // AFETCH result as its indirect address.
      Value *addr = cloneShallow(func, i->getSrc(0));
      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
                                      i->getSrc(0));
      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
      addr->reg.data.offset = 0;
      i->setSrc(0,
addr);
      i->setIndirect(0, 0, afetch->getDef(0));
   }

   return ret;
}

// Target entry point: run the lowering/legalization pass matching the
// given codegen stage (PRE_SSA -> lowering, SSA -> legalize SSA,
// POST_RA -> post-RA cleanup).
bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir