CoCalc -- nv50_ir_lowering

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
⁴⁵⁷⁴ views
1
/*
2
 * Copyright 2011 Christoph Bumiller
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice shall be included in
12
 * all copies or substantial portions of the Software.
13
 *
14
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20
 * OTHER DEALINGS IN THE SOFTWARE.
21
 */
22

23
#include "codegen/nv50_ir.h"
24
#include "codegen/nv50_ir_build_util.h"
25

26
#include "codegen/nv50_ir_target_nv50.h"
27

28
// Offsets of the individual entries inside one surface-info record.
// NOTE(review): the consumers of most of these offsets are outside this
// part of the file; only NV50_SU_INFO__STRIDE is used below (as the
// per-slot record size in loadSuInfo / loadSuInfo16).

// Indexed accessors: component i (0 = x, 1 = y, 2 = z) of a field group.
#define NV50_SU_INFO_SIZE(i)       (0x00 + (i) * 4)
#define NV50_SU_INFO_MS(i)         (0x18 + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)

// Named aliases for the individual components.
#define NV50_SU_INFO_SIZE_X       NV50_SU_INFO_SIZE(0)
#define NV50_SU_INFO_SIZE_Y       NV50_SU_INFO_SIZE(1)
#define NV50_SU_INFO_SIZE_Z       NV50_SU_INFO_SIZE(2)
#define NV50_SU_INFO_BSIZE        0x0c
#define NV50_SU_INFO_STRIDE_Y     0x10
#define NV50_SU_INFO_MS_X         NV50_SU_INFO_MS(0)
#define NV50_SU_INFO_MS_Y         NV50_SU_INFO_MS(1)
#define NV50_SU_INFO_TILE_SHIFT_X NV50_SU_INFO_TILE_SHIFT(0)
#define NV50_SU_INFO_TILE_SHIFT_Y NV50_SU_INFO_TILE_SHIFT(1)
#define NV50_SU_INFO_TILE_SHIFT_Z NV50_SU_INFO_TILE_SHIFT(2)
#define NV50_SU_INFO_OFFSET_Z     0x2c

// Size of one record; slot N starts at N * NV50_SU_INFO__STRIDE.
#define NV50_SU_INFO__STRIDE 0x30
45

46
namespace nv50_ir {
47

48
// nv50 doesn't support 32 bit integer multiplication
49
//
50
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
51
// -------------------
52
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
53
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
54
//       al*bl
55
//    ah*bl 00
56
//
57
// fffe0001 + fffe0001
58
//
59
// Note that this sort of splitting doesn't work for signed values, so we
60
// compute the sign on those manually and then perform an unsigned multiply.
61
static bool
62
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
63
{
64
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
65
   ImmediateValue src1;
66
   bool src1imm = mul->src(1).getImmediate(src1);
67

68
   DataType fTy; // full type
69
   switch (mul->sType) {
70
   case TYPE_S32: fTy = TYPE_U32; break;
71
   case TYPE_S64: fTy = TYPE_U64; break;
72
   default: fTy = mul->sType; break;
73
   }
74

75
   DataType hTy; // half type
76
   switch (fTy) {
77
   case TYPE_U32: hTy = TYPE_U16; break;
78
   case TYPE_U64: hTy = TYPE_U32; break;
79
   default:
80
      return false;
81
   }
82
   unsigned int fullSize = typeSizeof(fTy);
83
   unsigned int halfSize = typeSizeof(hTy);
84

85
   Instruction *i[9];
86

87
   bld->setPosition(mul, true);
88

89
   Value *s[2];
90
   Value *a[2], *b[2];
91
   Value *t[4];
92
   for (int j = 0; j < 4; ++j)
93
      t[j] = bld->getSSA(fullSize);
94

95
   if (isSignedType(mul->sType) && highResult) {
96
      s[0] = bld->getSSA(fullSize);
97
      s[1] = bld->getSSA(fullSize);
98
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
99
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
100
      src1.reg.data.s32 = abs(src1.reg.data.s32);
101
   } else {
102
      s[0] = mul->getSrc(0);
103
      s[1] = mul->getSrc(1);
104
   }
105

106
   // split sources into halves
107
   i[0] = bld->mkSplit(a, halfSize, s[0]);
108
   i[1] = bld->mkSplit(b, halfSize, s[1]);
109

110
   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
111
      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
112
                               bld->mkImm(src1.reg.data.u32 & 0xffff));
113
   } else {
114
      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
115
                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
116
      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
117
         i[3] = i[2];
118
         t[1] = t[0];
119
      } else {
120
         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
121
      }
122
   }
123
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
124
   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
125
      i[4] = i[3];
126
      t[3] = t[2];
127
   } else {
128
      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
129
   }
130

131
   if (highResult) {
132
      Value *c[2];
133
      Value *r[5];
134
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
135
      c[0] = bld->getSSA(1, FILE_FLAGS);
136
      c[1] = bld->getSSA(1, FILE_FLAGS);
137
      for (int j = 0; j < 5; ++j)
138
         r[j] = bld->getSSA(fullSize);
139

140
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
141
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
142
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
143
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
144
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
145

146
      // set carry defs / sources
147
      i[3]->setFlagsDef(1, c[0]);
148
      // actual result required in negative case, but ignored for
149
      // unsigned. for some reason the compiler ends up dropping the whole
150
      // instruction if the destination is unused but the flags are.
151
      if (isSignedType(mul->sType))
152
         i[4]->setFlagsDef(1, c[1]);
153
      else
154
         i[4]->setFlagsDef(0, c[1]);
155
      i[6]->setPredicate(CC_C, c[0]);
156
      i[5]->setFlagsSrc(3, c[1]);
157

158
      if (isSignedType(mul->sType)) {
159
         Value *cc[2];
160
         Value *rr[7];
161
         Value *one = bld->getSSA(fullSize);
162
         bld->loadImm(one, 1);
163
         for (int j = 0; j < 7; j++)
164
            rr[j] = bld->getSSA(fullSize);
165

166
         // NOTE: this logic uses predicates because splitting basic blocks is
167
         // ~impossible during the SSA phase. The RA relies on a correlation
168
         // between edge order and phi node sources.
169

170
         // Set the sign of the result based on the inputs
171
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
172
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
173

174
         // 1s complement of 64-bit value
175
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
176
            ->setPredicate(CC_S, cc[0]);
177
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
178
            ->setPredicate(CC_S, cc[0]);
179

180
         // add to low 32-bits, keep track of the carry
181
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
182
         n->setPredicate(CC_S, cc[0]);
183
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
184

185
         // If there was a carry, add 1 to the upper 32 bits
186
         // XXX: These get executed even if they shouldn't be
187
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
188
            ->setPredicate(CC_C, cc[1]);
189
         bld->mkMov(rr[3], rr[0])
190
            ->setPredicate(CC_NC, cc[1]);
191
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
192

193
         // Merge the results from the negative and non-negative paths
194
         bld->mkMov(rr[5], rr[4])
195
            ->setPredicate(CC_S, cc[0]);
196
         bld->mkMov(rr[6], r[4])
197
            ->setPredicate(CC_NS, cc[0]);
198
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
199
      } else {
200
         bld->mkMov(mul->getDef(0), r[4]);
201
      }
202
   } else {
203
      bld->mkMov(mul->getDef(0), t[3]);
204
   }
205
   delete_Instruction(bld->getProgram(), mul);
206

207
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
208
      if (i[j])
209
         i[j]->sType = hTy;
210

211
   return true;
212
}
213

214
// Two-bit per-lane operation codes that QUADOP() packs together.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four lane operations into one byte, two bits per lane, with the
// first argument in the most significant position.
//             UL UR LL LR
#define QUADOP(q, r, s, t) \
   ((((((QOP_##q << 2) | QOP_##r) << 2) | QOP_##s) << 2) | QOP_##t)
223

224
class NV50LegalizePostRA : public Pass
225
{
226
public:
227
   NV50LegalizePostRA() : r63(NULL) { }
228

229
private:
230
   virtual bool visit(Function *);
231
   virtual bool visit(BasicBlock *);
232

233
   void handlePRERET(FlowInstruction *);
234
   void replaceZero(Instruction *);
235

236
   BuildUtil bld;
237

238
   LValue *r63;
239
};
240

241
bool
242
NV50LegalizePostRA::visit(Function *fn)
243
{
244
   Program *prog = fn->getProgram();
245

246
   r63 = new_LValue(fn, FILE_GPR);
247
   // GPR units on nv50 are in half-regs
248
   if (prog->maxGPR < 126)
249
      r63->reg.data.id = 63;
250
   else
251
      r63->reg.data.id = 127;
252

253
   // this is actually per-program, but we can do it all on visiting main()
254
   std::list<Instruction *> *outWrites =
255
      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
256

257
   if (outWrites) {
258
      for (std::list<Instruction *>::iterator it = outWrites->begin();
259
           it != outWrites->end(); ++it)
260
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
261
      // instructions will be deleted on exit
262
      outWrites->clear();
263
   }
264

265
   return true;
266
}
267

268
void
269
NV50LegalizePostRA::replaceZero(Instruction *i)
270
{
271
   for (int s = 0; i->srcExists(s); ++s) {
272
      ImmediateValue *imm = i->getSrc(s)->asImm();
273
      if (imm && imm->reg.data.u64 == 0)
274
         i->setSrc(s, r63);
275
   }
276
}
277

278
// Emulate PRERET: jump to the target and call to the origin from there
279
//
280
// WARNING: atm only works if BBs are affected by at most a single PRERET
281
//
282
// BB:0
283
// preret BB:3
284
// (...)
285
// BB:3
286
// (...)
287
//             --->
288
// BB:0
289
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
290
// (...)
291
// BB:3
292
// bra BB:3 + n1 (skip the call)
293
// call BB:0 + n2 (skip bra at beginning of BB:0)
294
// (...)
295
void
296
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
297
{
298
   BasicBlock *bbE = pre->bb;
299
   BasicBlock *bbT = pre->target.bb;
300

301
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
302
   bbE->remove(pre);
303
   bbE->insertHead(pre);
304

305
   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
306
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
307

308
   bbT->insertHead(call);
309
   bbT->insertHead(skip);
310

311
   // NOTE: maybe split blocks to prevent the instructions from moving ?
312

313
   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
314
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
315
}
316

317
bool
318
NV50LegalizePostRA::visit(BasicBlock *bb)
319
{
320
   Instruction *i, *next;
321

322
   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
323
   for (i = bb->getFirst(); i; i = next) {
324
      next = i->next;
325
      if (i->isNop()) {
326
         bb->remove(i);
327
      } else
328
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
329
         handlePRERET(i->asFlow());
330
      } else {
331
         // TODO: We will want to do this before register allocation,
332
         // since have to use a $c register for the carry flag.
333
         if (typeSizeof(i->dType) == 8) {
334
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
335
            if (hi)
336
               next = hi;
337
         }
338

339
         if (i->op != OP_PFETCH && i->op != OP_BAR &&
340
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
341
            replaceZero(i);
342
      }
343
   }
344
   if (!bb->getEntry())
345
      return true;
346

347
   return true;
348
}
349

350
class NV50LegalizeSSA : public Pass
351
{
352
public:
353
   NV50LegalizeSSA(Program *);
354

355
   virtual bool visit(BasicBlock *bb);
356

357
private:
358
   void propagateWriteToOutput(Instruction *);
359
   void handleDIV(Instruction *);
360
   void handleMOD(Instruction *);
361
   void handleMUL(Instruction *);
362
   void handleAddrDef(Instruction *);
363

364
   inline bool isARL(const Instruction *) const;
365

366
   BuildUtil bld;
367

368
   std::list<Instruction *> *outWrites;
369
};
370

371
// Output write propagation is only enabled at optimization level 2 or
// higher, and only for vertex and geometry programs.
NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   outWrites = NULL;
   if (prog->optLevel < 2)
      return;
   if (prog->getType() != Program::TYPE_GEOMETRY &&
       prog->getType() != Program::TYPE_VERTEX)
      return;

   outWrites = reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
}
383

384
void
385
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
386
{
387
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
388
      return;
389

390
   // check def instruction can store
391
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
392

393
   // TODO: move exports (if beneficial) in common opt pass
394
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
395
      return;
396

397
   for (int s = 0; di->srcExists(s); ++s)
398
      if (di->src(s).getFile() == FILE_IMMEDIATE ||
399
          di->src(s).getFile() == FILE_MEMORY_LOCAL)
400
         return;
401

402
   if (prog->getType() == Program::TYPE_GEOMETRY) {
403
      // Only propagate output writes in geometry shaders when we can be sure
404
      // that we are propagating to the same output vertex.
405
      if (di->bb != st->bb)
406
         return;
407
      Instruction *i;
408
      for (i = di; i != st; i = i->next) {
409
         if (i->op == OP_EMIT || i->op == OP_RESTART)
410
            return;
411
      }
412
      assert(i); // st after di
413
   }
414

415
   // We cannot set defs to non-lvalues before register allocation, so
416
   // save & remove (to save registers) the exports and replace later.
417
   outWrites->push_back(st);
418
   st->bb->remove(st);
419
}
420

421
bool
422
NV50LegalizeSSA::isARL(const Instruction *i) const
423
{
424
   ImmediateValue imm;
425

426
   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
427
      return false;
428
   if (!i->src(1).getImmediate(imm))
429
      return false;
430
   return imm.isInteger(0);
431
}
432

433
void
434
NV50LegalizeSSA::handleAddrDef(Instruction *i)
435
{
436
   Instruction *arl;
437

438
   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
439

440
   // PFETCH can always write to $a
441
   if (i->op == OP_PFETCH)
442
      return;
443
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
444
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
445
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
446
         return;
447
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
448
         return;
449
   }
450

451
   // turn $a sources into $r sources (can't operate on $a)
452
   for (int s = 0; i->srcExists(s); ++s) {
453
      Value *a = i->getSrc(s);
454
      Value *r;
455
      if (a->reg.file == FILE_ADDRESS) {
456
         if (a->getInsn() && isARL(a->getInsn())) {
457
            i->setSrc(s, a->getInsn()->getSrc(0));
458
         } else {
459
            bld.setPosition(i, false);
460
            r = bld.getSSA();
461
            bld.mkMov(r, a);
462
            i->setSrc(s, r);
463
         }
464
      }
465
   }
466
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
467
      return;
468

469
   // turn result back into $a
470
   bld.setPosition(i, true);
471
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
472
   i->setDef(0, arl->getSrc(0));
473
}
474

475
void
476
NV50LegalizeSSA::handleMUL(Instruction *mul)
477
{
478
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
479
      return;
480
   Value *def = mul->getDef(0);
481
   Value *pred = mul->getPredicate();
482
   CondCode cc = mul->cc;
483
   if (pred)
484
      mul->setPredicate(CC_ALWAYS, NULL);
485

486
   if (mul->op == OP_MAD) {
487
      Instruction *add = mul;
488
      bld.setPosition(add, false);
489
      Value *res = cloneShallow(func, mul->getDef(0));
490
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
491
      add->op = OP_ADD;
492
      add->setSrc(0, mul->getDef(0));
493
      add->setSrc(1, add->getSrc(2));
494
      for (int s = 2; add->srcExists(s); ++s)
495
         add->setSrc(s, NULL);
496
      mul->subOp = add->subOp;
497
      add->subOp = 0;
498
   }
499
   expandIntegerMUL(&bld, mul);
500
   if (pred)
501
      def->getInsn()->setPredicate(cc, pred);
502
}
503

504
// Use f32 division: first compute an approximate result, use it to reduce
505
// the dividend, which should then be representable as f32, divide the reduced
506
// dividend, and add the quotients.
507
void
508
NV50LegalizeSSA::handleDIV(Instruction *div)
509
{
510
   const DataType ty = div->sType;
511

512
   if (ty != TYPE_U32 && ty != TYPE_S32)
513
      return;
514

515
   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
516

517
   bld.setPosition(div, false);
518

519
   Value *a, *af = bld.getSSA();
520
   Value *b, *bf = bld.getSSA();
521

522
   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
523
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
524

525
   if (isSignedType(ty)) {
526
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
527
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
528
      a = bld.getSSA();
529
      b = bld.getSSA();
530
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
531
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
532
   } else {
533
      a = div->getSrc(0);
534
      b = div->getSrc(1);
535
   }
536

537
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
538
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
539

540
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
541
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
542

543
   // get error of 1st result
544
   expandIntegerMUL(&bld,
545
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
546
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
547

548
   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
549

550
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
551
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
552
      ->rnd = ROUND_Z;
553
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
554

555
   // correction: if modulus >= divisor, add 1
556
   expandIntegerMUL(&bld,
557
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
558
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
559
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
560
   if (!isSignedType(ty)) {
561
      div->op = OP_SUB;
562
      div->setSrc(0, q);
563
      div->setSrc(1, s);
564
   } else {
565
      t = q;
566
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
567
      s = bld.getSSA();
568
      t = bld.getSSA();
569
      // fix the sign
570
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
571
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
572
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
573
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
574

575
      div->op = OP_UNION;
576
      div->setSrc(0, s);
577
      div->setSrc(1, t);
578
   }
579
}
580

581
void
582
NV50LegalizeSSA::handleMOD(Instruction *mod)
583
{
584
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
585
      return;
586
   bld.setPosition(mod, false);
587

588
   Value *q = bld.getSSA();
589
   Value *m = bld.getSSA();
590

591
   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
592
   handleDIV(q->getInsn());
593

594
   bld.setPosition(mod, false);
595
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
596

597
   mod->op = OP_SUB;
598
   mod->setSrc(1, m);
599
}
600

601
bool
602
NV50LegalizeSSA::visit(BasicBlock *bb)
603
{
604
   Instruction *insn, *next;
605
   // skipping PHIs (don't pass them to handleAddrDef) !
606
   for (insn = bb->getEntry(); insn; insn = next) {
607
      next = insn->next;
608

609
      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
610
         handleAddrDef(insn);
611

612
      switch (insn->op) {
613
      case OP_EXPORT:
614
         if (outWrites)
615
            propagateWriteToOutput(insn);
616
         break;
617
      case OP_DIV:
618
         handleDIV(insn);
619
         break;
620
      case OP_MOD:
621
         handleMOD(insn);
622
         break;
623
      case OP_MAD:
624
      case OP_MUL:
625
         handleMUL(insn);
626
         break;
627
      default:
628
         break;
629
      }
630
   }
631
   return true;
632
}
633

634
class NV50LoweringPreSSA : public Pass
635
{
636
public:
637
   NV50LoweringPreSSA(Program *);
638

639
private:
640
   virtual bool visit(Instruction *);
641
   virtual bool visit(Function *);
642

643
   bool handleRDSV(Instruction *);
644
   bool handleWRSV(Instruction *);
645

646
   bool handlePFETCH(Instruction *);
647
   bool handleEXPORT(Instruction *);
648
   bool handleLOAD(Instruction *);
649
   bool handleLDST(Instruction *);
650
   bool handleMEMBAR(Instruction *);
651
   bool handleSharedATOM(Instruction *);
652
   bool handleSULDP(TexInstruction *);
653
   bool handleSUREDP(TexInstruction *);
654
   bool handleSUSTP(TexInstruction *);
655
   Value *processSurfaceCoords(TexInstruction *);
656

657
   bool handleDIV(Instruction *);
658
   bool handleSQRT(Instruction *);
659
   bool handlePOW(Instruction *);
660

661
   bool handleSET(Instruction *);
662
   bool handleSLCT(CmpInstruction *);
663
   bool handleSELP(Instruction *);
664

665
   bool handleTEX(TexInstruction *);
666
   bool handleTXB(TexInstruction *); // I really
667
   bool handleTXL(TexInstruction *); // hate
668
   bool handleTXD(TexInstruction *); // these 3
669
   bool handleTXLQ(TexInstruction *);
670
   bool handleTXQ(TexInstruction *);
671
   bool handleSUQ(TexInstruction *);
672
   bool handleBUFQ(Instruction *);
673

674
   bool handleCALL(Instruction *);
675
   bool handlePRECONT(Instruction *);
676
   bool handleCONT(Instruction *);
677

678
   void checkPredicate(Instruction *);
679
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
680
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
681
   Value *loadSuInfo(int slot, uint32_t off);
682
   Value *loadSuInfo16(int slot, uint32_t off);
683

684
private:
685
   const Target *const targ;
686

687
   BuildUtil bld;
688

689
   Value *tid;
690
};
691

692
// Caches the program's target and binds the instruction builder to @prog.
// tid stays NULL until visit(Function *) sets it for compute programs.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog)
   : targ(prog->getTarget()),
     tid(NULL)
{
   bld.setProgram(prog);
}
697

698
bool
699
NV50LoweringPreSSA::visit(Function *f)
700
{
701
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
702

703
   if (prog->getType() == Program::TYPE_COMPUTE) {
704
      // Add implicit "thread id" argument in $r0 to the function
705
      Value *arg = new_LValue(func, FILE_GPR);
706
      arg->reg.data.id = 0;
707
      f->ins.push_back(arg);
708

709
      bld.setPosition(root, false);
710
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
711
   }
712

713
   return true;
714
}
715

716
void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
717
                                       Value **ms_x, Value **ms_y) {
718
   // This loads the texture-indexed ms setting from the constant buffer
719
   Value *tmp = new_LValue(func, FILE_GPR);
720
   uint8_t b = prog->driver->io.auxCBSlot;
721
   off += prog->driver->io.suInfoBase;
722
   if (prog->getType() > Program::TYPE_VERTEX)
723
      off += 16 * 2 * 4;
724
   if (prog->getType() > Program::TYPE_GEOMETRY)
725
      off += 16 * 2 * 4;
726
   if (prog->getType() > Program::TYPE_FRAGMENT)
727
      off += 16 * 2 * 4;
728
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
729
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
730
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
731
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
732
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
733
}
734

735
// Given a MS level @ms and a sample id @s, loads the delta x/y for that
// sample from the ms info constant buffer into *dx and *dy.
void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
   const uint8_t cbSlot = prog->driver->io.msInfoCBSlot;
   Value *const addr = new_LValue(func, FILE_ADDRESS);
   Value *const index = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
   bld.mkOp2(OP_SHL, TYPE_U32, index, ms, bld.mkImm(3));
   bld.mkOp2(OP_ADD, TYPE_U32, index, index, s);
   bld.mkOp2(OP_SHL, TYPE_U32, addr, index, bld.mkImm(3));

   // dx and dy are adjacent 32 bit words, indirectly addressed through addr.
   const uint32_t base = prog->driver->io.msInfoBase;
   Symbol *const symDx =
      bld.mkSymbol(FILE_MEMORY_CONST, cbSlot, TYPE_U32, base);
   *dx = bld.mkLoadv(TYPE_U32, symDx, addr);

   Symbol *const symDy =
      bld.mkSymbol(FILE_MEMORY_CONST, cbSlot, TYPE_U32, base + 4);
   *dy = bld.mkLoadv(TYPE_U32, symDy, addr);
}
756

757
// Loads the 32 bit entry at offset @off of surface info record @slot from
// the auxiliary constant buffer and returns the loaded value.
Value *
NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
{
   const uint8_t cbSlot = prog->driver->io.auxCBSlot;

   // Records are NV50_SU_INFO__STRIDE bytes apart, starting at bufInfoBase.
   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;

   Symbol *const entry =
      bld.mkSymbol(FILE_MEMORY_CONST, cbSlot, TYPE_U32, off);
   return bld.mkLoadv(TYPE_U32, entry, NULL);
}
765

766
// Loads the 16 bit entry at offset @off of surface info record @slot from
// the auxiliary constant buffer and returns the loaded value.
Value *
NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
{
   const uint8_t cbSlot = prog->driver->io.auxCBSlot;

   // Records are NV50_SU_INFO__STRIDE bytes apart, starting at bufInfoBase.
   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;

   Symbol *const entry =
      bld.mkSymbol(FILE_MEMORY_CONST, cbSlot, TYPE_U16, off);
   return bld.mkLoadv(TYPE_U16, entry, NULL);
}
774

775
bool
776
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
777
{
778
   const int arg = i->tex.target.getArgCount();
779
   const int dref = arg;
780
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
781

782
   /* Only normalize in the non-explicit derivatives case.
783
    */
784
   if (i->tex.target.isCube() && i->op != OP_TXD) {
785
      Value *src[3], *val;
786
      int c;
787
      for (c = 0; c < 3; ++c)
788
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
789
      val = bld.getScratch();
790
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
791
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
792
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
793
      for (c = 0; c < 3; ++c) {
794
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
795
                                 i->getSrc(c), val));
796
      }
797
   }
798

799
   // handle MS, which means looking up the MS params for this texture, and
800
   // adjusting the input coordinates to point at the right sample.
801
   if (i->tex.target.isMS()) {
802
      Value *x = i->getSrc(0);
803
      Value *y = i->getSrc(1);
804
      Value *s = i->getSrc(arg - 1);
805
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
806
         *ms, *ms_x, *ms_y, *dx, *dy;
807

808
      i->tex.target.clearMS();
809

810
      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
811
      loadMsInfo(ms, s, &dx, &dy);
812

813
      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
814
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
815
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
816
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
817
      i->setSrc(0, tx);
818
      i->setSrc(1, ty);
819
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
820
   }
821

822
   // dref comes before bias/lod
823
   if (i->tex.target.isShadow())
824
      if (i->op == OP_TXB || i->op == OP_TXL)
825
         i->swapSources(dref, lod);
826

827
   if (i->tex.target.isArray()) {
828
      if (i->op != OP_TXF) {
829
         // array index must be converted to u32, but it's already an integer
830
         // for TXF
831
         Value *layer = i->getSrc(arg - 1);
832
         LValue *src = new_LValue(func, FILE_GPR);
833
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
834
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
835
         i->setSrc(arg - 1, src);
836
      }
837
      if (i->tex.target.isCube() && i->srcCount() > 4) {
838
         std::vector<Value *> acube, a2d;
839
         int c;
840

841
         acube.resize(4);
842
         for (c = 0; c < 4; ++c)
843
            acube[c] = i->getSrc(c);
844
         a2d.resize(4);
845
         for (c = 0; c < 3; ++c)
846
            a2d[c] = new_LValue(func, FILE_GPR);
847
         a2d[3] = NULL;
848

849
         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
850
                   a2d, acube)->asTex()->tex.mask = 0x7;
851

852
         for (c = 0; c < 3; ++c)
853
            i->setSrc(c, a2d[c]);
854
         for (; i->srcExists(c + 1); ++c)
855
            i->setSrc(c, i->getSrc(c + 1));
856
         i->setSrc(c, NULL);
857
         assert(c <= 4);
858

859
         i->tex.target = i->tex.target.isShadow() ?
860
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
861
      }
862
   }
863

864
   // texel offsets are 3 immediate fields in the instruction,
865
   // nv50 cannot do textureGatherOffsets
866
   assert(i->tex.useOffsets <= 1);
867
   if (i->tex.useOffsets) {
868
      for (int c = 0; c < 3; ++c) {
869
         ImmediateValue val;
870
         if (!i->offset[0][c].getImmediate(val))
871
            assert(!"non-immediate offset");
872
         i->tex.offset[c] = val.reg.data.u32;
873
         i->offset[0][c].set(NULL);
874
      }
875
   }
876

877
   return true;
878
}
879

880
// Bias must be equal for all threads of a quad or lod calculation will fail.
881
//
882
// The lanes of a quad are grouped by the bit in the condition register they
883
// have set, which is selected by differing bias values.
884
// Move the input values for TEX into a new register set for each group and
885
// execute TEX only for a specific group.
886
// We always need to use 4 new registers for the inputs/outputs because the
887
// implicitly calculated derivatives must be correct.
888
//
889
// TODO: move to SSA phase so we can easily determine whether bias is constant
890
bool
891
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
892
{
893
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
894
   int l, d;
895

896
   // We can't actually apply bias *and* do a compare for a cube
897
   // texture. Since the compare has to be done before the filtering, just
898
   // drop the bias on the floor.
899
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
900
      i->op = OP_TEX;
901
      i->setSrc(3, i->getSrc(4));
902
      i->setSrc(4, NULL);
903
      return handleTEX(i);
904
   }
905

906
   handleTEX(i);
907
   Value *bias = i->getSrc(i->tex.target.getArgCount());
908
   if (bias->isUniform())
909
      return true;
910

911
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
912
                                 bld.loadImm(NULL, 1));
913
   bld.setPosition(cond, false);
914

915
   for (l = 1; l < 4; ++l) {
916
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
917
      Value *bit = bld.getSSA();
918
      Value *pred = bld.getScratch(1, FILE_FLAGS);
919
      Value *imm = bld.loadImm(NULL, (1 << l));
920
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
921
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
922
      cond->setSrc(l, bit);
923
   }
924
   Value *flags = bld.getScratch(1, FILE_FLAGS);
925
   bld.setPosition(cond, true);
926
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
927

928
   Instruction *tex[4];
929
   for (l = 0; l < 4; ++l) {
930
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
931
      bld.insert(tex[l]);
932
   }
933

934
   Value *res[4][4];
935
   for (d = 0; i->defExists(d); ++d)
936
      res[0][d] = tex[0]->getDef(d);
937
   for (l = 1; l < 4; ++l) {
938
      for (d = 0; tex[l]->defExists(d); ++d) {
939
         res[l][d] = cloneShallow(func, res[0][d]);
940
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
941
      }
942
   }
943

944
   for (d = 0; i->defExists(d); ++d) {
945
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
946
      for (l = 0; l < 4; ++l)
947
         dst->setSrc(l, res[l][d]);
948
   }
949
   delete_Instruction(prog, i);
950
   return true;
951
}
952

953
// LOD must be equal for all threads of a quad.
954
// Unlike with TXB, here we can just diverge since there's no LOD calculation
955
// that would require all 4 threads' sources to be set up properly.
956
bool
957
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
958
{
959
   handleTEX(i);
960
   Value *lod = i->getSrc(i->tex.target.getArgCount());
961
   if (lod->isUniform())
962
      return true;
963

964
   BasicBlock *currBB = i->bb;
965
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
966
   BasicBlock *joinBB = i->bb->splitAfter(i);
967

968
   bld.setPosition(currBB, true);
969
   assert(!currBB->joinAt);
970
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
971

972
   for (int l = 0; l <= 3; ++l) {
973
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
974
      Value *pred = bld.getScratch(1, FILE_FLAGS);
975
      bld.setPosition(currBB, true);
976
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
977
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
978
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
979
      if (l <= 2) {
980
         BasicBlock *laneBB = new BasicBlock(func);
981
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
982
         currBB = laneBB;
983
      }
984
   }
985
   bld.setPosition(joinBB, false);
986
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
987
   return true;
988
}
989

990
bool
991
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
992
{
993
   static const uint8_t qOps[4][2] =
994
   {
995
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
996
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
997
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
998
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
999
   };
1000
   Value *def[4][4];
1001
   Value *crd[3];
1002
   Instruction *tex;
1003
   Value *zero = bld.loadImm(bld.getSSA(), 0);
1004
   int l, c;
1005
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
1006

1007
   handleTEX(i);
1008
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
1009
   i->tex.derivAll = true;
1010

1011
   for (c = 0; c < dim; ++c)
1012
      crd[c] = bld.getScratch();
1013

1014
   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
1015
   for (l = 0; l < 4; ++l) {
1016
      Value *src[3], *val;
1017
      // mov coordinates from lane l to all lanes
1018
      for (c = 0; c < dim; ++c)
1019
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
1020
      // add dPdx from lane l to lanes dx
1021
      for (c = 0; c < dim; ++c)
1022
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
1023
      // add dPdy from lane l to lanes dy
1024
      for (c = 0; c < dim; ++c)
1025
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
1026
      // normalize cube coordinates if necessary
1027
      if (i->tex.target.isCube()) {
1028
         for (c = 0; c < 3; ++c)
1029
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
1030
         val = bld.getScratch();
1031
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
1032
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
1033
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
1034
         for (c = 0; c < 3; ++c)
1035
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
1036
      } else {
1037
         for (c = 0; c < dim; ++c)
1038
            src[c] = crd[c];
1039
      }
1040
      // texture
1041
      bld.insert(tex = cloneForward(func, i));
1042
      for (c = 0; c < dim; ++c)
1043
         tex->setSrc(c, src[c]);
1044
      // save results
1045
      for (c = 0; i->defExists(c); ++c) {
1046
         Instruction *mov;
1047
         def[c][l] = bld.getSSA();
1048
         mov = bld.mkMov(def[c][l], tex->getDef(c));
1049
         mov->fixed = 1;
1050
         mov->lanes = 1 << l;
1051
      }
1052
   }
1053
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1054

1055
   for (c = 0; i->defExists(c); ++c) {
1056
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1057
      for (l = 0; l < 4; ++l)
1058
         u->setSrc(l, def[c][l]);
1059
   }
1060

1061
   i->bb->remove(i);
1062
   return true;
1063
}
1064

1065
bool
1066
NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1067
{
1068
   handleTEX(i);
1069
   bld.setPosition(i, true);
1070

1071
   /* The returned values are not quite what we want:
1072
    * (a) convert from s32 to f32
1073
    * (b) multiply by 1/256
1074
    */
1075
   for (int def = 0; def < 2; ++def) {
1076
      if (!i->defExists(def))
1077
         continue;
1078
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1079
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1080
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1081
   }
1082
   return true;
1083
}
1084

1085
bool
1086
NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1087
{
1088
   Value *ms, *ms_x, *ms_y;
1089
   if (i->tex.query == TXQ_DIMS) {
1090
      if (i->tex.target.isMS()) {
1091
         bld.setPosition(i, true);
1092
         loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1093
         int d = 0;
1094
         if (i->tex.mask & 1) {
1095
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
1096
            d++;
1097
         }
1098
         if (i->tex.mask & 2) {
1099
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
1100
            d++;
1101
         }
1102
      }
1103
      return true;
1104
   }
1105
   assert(i->tex.query == TXQ_TYPE);
1106
   assert(i->tex.mask == 4);
1107

1108
   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1109
   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1110
   i->bb->remove(i);
1111

1112
   return true;
1113
}
1114

1115
bool
1116
NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
1117
{
1118
   const int dim = suq->tex.target.getDim();
1119
   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1120
   int mask = suq->tex.mask;
1121
   int slot = suq->tex.r;
1122
   int c, d;
1123

1124
   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1125
      if (c >= arg || !(mask & 1))
1126
         continue;
1127

1128
      int offset;
1129

1130
      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1131
         offset = NV50_SU_INFO_SIZE(2);
1132
      } else {
1133
         offset = NV50_SU_INFO_SIZE(c);
1134
      }
1135
      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
1136
      if (c == 2 && suq->tex.target.isCube())
1137
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1138
                   bld.loadImm(NULL, 6));
1139
   }
1140

1141
   if (mask & 1) {
1142
      if (suq->tex.target.isMS()) {
1143
         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
1144
         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
1145
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1146
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1147
      } else {
1148
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1149
      }
1150
   }
1151

1152
   bld.remove(suq);
1153
   return true;
1154
}
1155

1156
bool
1157
NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
1158
{
1159
   bufq->op = OP_MOV;
1160
   bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
1161
   bufq->setIndirect(0, 0, NULL);
1162
   bufq->setIndirect(0, 1, NULL);
1163
   return true;
1164
}
1165

1166
bool
1167
NV50LoweringPreSSA::handleSET(Instruction *i)
1168
{
1169
   if (i->dType == TYPE_F32) {
1170
      bld.setPosition(i, true);
1171
      i->dType = TYPE_U32;
1172
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1173
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1174
   }
1175
   return true;
1176
}
1177

1178
bool
1179
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1180
{
1181
   Value *src0 = bld.getSSA();
1182
   Value *src1 = bld.getSSA();
1183
   Value *pred = bld.getScratch(1, FILE_FLAGS);
1184

1185
   Value *v0 = i->getSrc(0);
1186
   Value *v1 = i->getSrc(1);
1187
   // XXX: these probably shouldn't be immediates in the first place ...
1188
   if (v0->asImm())
1189
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1190
   if (v1->asImm())
1191
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1192

1193
   bld.setPosition(i, true);
1194
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1195
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1196
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1197

1198
   bld.setPosition(i, false);
1199
   i->op = OP_SET;
1200
   i->setFlagsDef(0, pred);
1201
   i->dType = TYPE_U8;
1202
   i->setSrc(0, i->getSrc(2));
1203
   i->setSrc(2, NULL);
1204
   i->setSrc(1, bld.loadImm(NULL, 0));
1205

1206
   return true;
1207
}
1208

1209
bool
1210
NV50LoweringPreSSA::handleSELP(Instruction *i)
1211
{
1212
   Value *src0 = bld.getSSA();
1213
   Value *src1 = bld.getSSA();
1214

1215
   Value *v0 = i->getSrc(0);
1216
   Value *v1 = i->getSrc(1);
1217
   if (v0->asImm())
1218
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1219
   if (v1->asImm())
1220
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1221

1222
   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1223
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1224
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1225
   delete_Instruction(prog, i);
1226
   return true;
1227
}
1228

1229
bool
1230
NV50LoweringPreSSA::handleWRSV(Instruction *i)
1231
{
1232
   Symbol *sym = i->getSrc(0)->asSym();
1233

1234
   // these are all shader outputs, $sreg are not writeable
1235
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1236
   if (addr >= 0x400)
1237
      return false;
1238
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1239

1240
   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1241

1242
   bld.getBB()->remove(i);
1243
   return true;
1244
}
1245

1246
bool
1247
NV50LoweringPreSSA::handleCALL(Instruction *i)
1248
{
1249
   if (prog->getType() == Program::TYPE_COMPUTE) {
1250
      // Add implicit "thread id" argument in $r0 to the function
1251
      i->setSrc(i->srcCount(), tid);
1252
   }
1253
   return true;
1254
}
1255

1256
bool
1257
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1258
{
1259
   delete_Instruction(prog, i);
1260
   return true;
1261
}
1262

1263
bool
1264
NV50LoweringPreSSA::handleCONT(Instruction *i)
1265
{
1266
   i->op = OP_BRA;
1267
   return true;
1268
}
1269

1270
bool
1271
NV50LoweringPreSSA::handleRDSV(Instruction *i)
1272
{
1273
   Symbol *sym = i->getSrc(0)->asSym();
1274
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1275
   Value *def = i->getDef(0);
1276
   SVSemantic sv = sym->reg.data.sv.sv;
1277
   int idx = sym->reg.data.sv.index;
1278

1279
   if (addr >= 0x400) // mov $sreg
1280
      return true;
1281

1282
   switch (sv) {
1283
   case SV_POSITION:
1284
      assert(prog->getType() == Program::TYPE_FRAGMENT);
1285
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1286
      break;
1287
   case SV_FACE:
1288
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1289
      if (i->dType == TYPE_F32) {
1290
         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1291
         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1292
         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1293
      }
1294
      break;
1295
   case SV_NCTAID:
1296
   case SV_CTAID:
1297
   case SV_NTID: {
1298
      Value *x = bld.getSSA(2);
1299
      bld.mkOp1(OP_LOAD, TYPE_U16, x,
1300
                bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1301
      bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1302
      break;
1303
   }
1304
   case SV_TID:
1305
      if (idx == 0) {
1306
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1307
      } else if (idx == 1) {
1308
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1309
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1310
      } else if (idx == 2) {
1311
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1312
      } else {
1313
         bld.mkMov(def, bld.mkImm(0));
1314
      }
1315
      break;
1316
   case SV_COMBINED_TID:
1317
      bld.mkMov(def, tid);
1318
      break;
1319
   case SV_SAMPLE_POS: {
1320
      Value *off = new_LValue(func, FILE_ADDRESS);
1321
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1322
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1323
      bld.mkLoad(TYPE_F32,
1324
                 def,
1325
                 bld.mkSymbol(
1326
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
1327
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1328
                 off);
1329
      break;
1330
   }
1331
   case SV_THREAD_KILL:
1332
      // Not actually supported. But it's implementation-dependent, so we can
1333
      // always just say it's not a helper.
1334
      bld.mkMov(def, bld.loadImm(NULL, 0));
1335
      break;
1336
   default:
1337
      bld.mkFetch(i->getDef(0), i->dType,
1338
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1339
      break;
1340
   }
1341
   bld.getBB()->remove(i);
1342
   return true;
1343
}
1344

1345
bool
1346
NV50LoweringPreSSA::handleDIV(Instruction *i)
1347
{
1348
   if (!isFloatType(i->dType))
1349
      return true;
1350
   bld.setPosition(i, false);
1351
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1352
   i->op = OP_MUL;
1353
   i->setSrc(1, rcp->getDef(0));
1354
   return true;
1355
}
1356

1357
bool
1358
NV50LoweringPreSSA::handleSQRT(Instruction *i)
1359
{
1360
   bld.setPosition(i, true);
1361
   i->op = OP_RSQ;
1362
   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1363

1364
   return true;
1365
}
1366

1367
bool
1368
NV50LoweringPreSSA::handlePOW(Instruction *i)
1369
{
1370
   LValue *val = bld.getScratch();
1371

1372
   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1373
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1374
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1375

1376
   i->op = OP_EX2;
1377
   i->setSrc(0, val);
1378
   i->setSrc(1, NULL);
1379

1380
   return true;
1381
}
1382

1383
bool
1384
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1385
{
1386
   if (prog->getType() == Program::TYPE_FRAGMENT) {
1387
      if (i->getIndirect(0, 0)) {
1388
         // TODO: redirect to l[] here, load to GPRs at exit
1389
         return false;
1390
      } else {
1391
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1392

1393
         i->op = OP_MOV;
1394
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1395
         i->src(0).set(i->src(1));
1396
         i->setSrc(1, NULL);
1397
         i->setDef(0, new_LValue(func, FILE_GPR));
1398
         i->getDef(0)->reg.data.id = id;
1399

1400
         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1401
      }
1402
   }
1403
   return true;
1404
}
1405

1406
// Handle indirect addressing in geometry shaders:
1407
//
1408
// ld $r0 a[$a1][$a2+k] ->
1409
// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1410
//
1411
bool
1412
NV50LoweringPreSSA::handleLOAD(Instruction *i)
1413
{
1414
   ValueRef src = i->src(0);
1415
   Symbol *sym = i->getSrc(0)->asSym();
1416

1417
   if (prog->getType() == Program::TYPE_COMPUTE) {
1418
      if (sym->inFile(FILE_MEMORY_SHARED) ||
1419
          sym->inFile(FILE_MEMORY_BUFFER) ||
1420
          sym->inFile(FILE_MEMORY_GLOBAL)) {
1421
         return handleLDST(i);
1422
      }
1423
   }
1424

1425
   if (src.isIndirect(1)) {
1426
      assert(prog->getType() == Program::TYPE_GEOMETRY);
1427
      Value *addr = i->getIndirect(0, 1);
1428

1429
      if (src.isIndirect(0)) {
1430
         // base address is in an address register, so move to a GPR
1431
         Value *base = bld.getScratch();
1432
         bld.mkMov(base, addr);
1433

1434
         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1435
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1436
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1437
                                    i->getIndirect(0, 0), bld.mkImm(2));
1438

1439
         // Calculate final address: addr = base + attr*vstride; use 16-bit
1440
         // multiplication since 32-bit would be lowered to multiple
1441
         // instructions, and we only need the low 16 bits of the result
1442
         Value *a[2], *b[2];
1443
         bld.mkSplit(a, 2, attrib);
1444
         bld.mkSplit(b, 2, vstride);
1445
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1446
                                 base);
1447

1448
         // move address from GPR into an address register
1449
         addr = bld.getSSA(2, FILE_ADDRESS);
1450
         bld.mkMov(addr, sum);
1451
      }
1452

1453
      i->setIndirect(0, 1, NULL);
1454
      i->setIndirect(0, 0, addr);
1455
   }
1456

1457
   return true;
1458
}
1459

1460
bool
1461
NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
1462
{
1463
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1464

1465
   BasicBlock *currBB = atom->bb;
1466
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1467
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
1468
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
1469
   BasicBlock *failLockBB = new BasicBlock(func);
1470

1471
   bld.setPosition(currBB, true);
1472
   assert(!currBB->joinAt);
1473
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1474

1475
   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1476
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1477

1478
   bld.setPosition(tryLockBB, true);
1479

1480
   Instruction *ld =
1481
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1482
                 atom->getIndirect(0, 0));
1483
   Value *locked = bld.getSSA(1, FILE_FLAGS);
1484
   if (prog->getTarget()->getChipset() >= 0xa0) {
1485
      ld->setFlagsDef(1, locked);
1486
      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1487
   } else {
1488
      bld.mkMov(locked, bld.loadImm(NULL, 2))
1489
         ->flagsDef = 0;
1490
   }
1491

1492
   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
1493
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1494
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1495
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1496

1497
   tryLockBB->cfg.detach(&joinBB->cfg);
1498
   bld.remove(atom);
1499

1500
   bld.setPosition(setAndUnlockBB, true);
1501
   Value *stVal;
1502
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1503
      // Read the old value, and write the new one.
1504
      stVal = atom->getSrc(1);
1505
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1506
      CmpInstruction *set =
1507
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
1508
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
1509

1510
      Instruction *selp =
1511
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
1512
                   ld->getDef(0), set->getDef(0));
1513
      stVal = selp->getDef(0);
1514

1515
      handleSELP(selp);
1516
   } else {
1517
      operation op;
1518

1519
      switch (atom->subOp) {
1520
      case NV50_IR_SUBOP_ATOM_ADD:
1521
         op = OP_ADD;
1522
         break;
1523
      case NV50_IR_SUBOP_ATOM_AND:
1524
         op = OP_AND;
1525
         break;
1526
      case NV50_IR_SUBOP_ATOM_OR:
1527
         op = OP_OR;
1528
         break;
1529
      case NV50_IR_SUBOP_ATOM_XOR:
1530
         op = OP_XOR;
1531
         break;
1532
      case NV50_IR_SUBOP_ATOM_MIN:
1533
         op = OP_MIN;
1534
         break;
1535
      case NV50_IR_SUBOP_ATOM_MAX:
1536
         op = OP_MAX;
1537
         break;
1538
      default:
1539
         assert(0);
1540
         return false;
1541
      }
1542

1543
      Instruction *i =
1544
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1545
                   atom->getSrc(1));
1546

1547
      stVal = i->getDef(0);
1548
   }
1549

1550
   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1551
               atom->getIndirect(0, 0), stVal);
1552
   if (prog->getTarget()->getChipset() >= 0xa0) {
1553
      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1554
   }
1555

1556
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1557
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1558

1559
   // Loop until the lock is acquired.
1560
   bld.setPosition(failLockBB, true);
1561
   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
1562
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1563
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1564
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1565

1566
   bld.setPosition(joinBB, false);
1567
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1568

1569
   return true;
1570
}
1571

1572
bool
1573
NV50LoweringPreSSA::handleLDST(Instruction *i)
1574
{
1575
   ValueRef src = i->src(0);
1576
   Symbol *sym = i->getSrc(0)->asSym();
1577

1578
   if (prog->getType() != Program::TYPE_COMPUTE) {
1579
      return true;
1580
   }
1581

1582
   // Buffers just map directly to the different global memory spaces
1583
   if (sym->inFile(FILE_MEMORY_BUFFER)) {
1584
      sym->reg.file = FILE_MEMORY_GLOBAL;
1585
   }
1586

1587
   if (sym->inFile(FILE_MEMORY_SHARED)) {
1588

1589
      if (src.isIndirect(0)) {
1590
         Value *addr = i->getIndirect(0, 0);
1591

1592
         if (!addr->inFile(FILE_ADDRESS)) {
1593
            // Move address from GPR into an address register
1594
            Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
1595
            bld.mkMov(new_addr, addr);
1596

1597
            i->setIndirect(0, 0, new_addr);
1598
         }
1599
      }
1600

1601
      if (i->op == OP_ATOM)
1602
         handleSharedATOM(i);
1603
   } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
1604
      // All global access must be indirect. There are no instruction forms
1605
      // with direct access.
1606
      Value *addr = i->getIndirect(0, 0);
1607

1608
      Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
1609
      Value *sum;
1610
      if (addr != NULL)
1611
         sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
1612
                          offset);
1613
      else
1614
         sum = offset;
1615

1616
      i->setIndirect(0, 0, sum);
1617
      sym->reg.data.offset = 0;
1618
   }
1619

1620
   return true;
1621
}
1622

1623
bool
1624
NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
1625
{
1626
   // For global memory, apparently doing a bunch of reads at different
1627
   // addresses forces things to get sufficiently flushed.
1628
   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
1629
      uint8_t b = prog->driver->io.auxCBSlot;
1630
      Value *base =
1631
         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
1632
                                            prog->driver->io.membarOffset), NULL);
1633
      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
1634
      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1635
                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
1636
                                         physid, bld.loadImm(NULL, 0x1f)),
1637
                              bld.loadImm(NULL, 2));
1638
      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
1639
      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
1640
      for (int i = 0; i < 8; i++) {
1641
         if (i != 0) {
1642
            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
1643
         }
1644
         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
1645
            ->fixed = 1;
1646
      }
1647
   }
1648

1649
   // Both global and shared memory barriers also need a regular control bar
1650
   // TODO: double-check this is the case
1651
   i->op = OP_BAR;
1652
   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
1653
   i->setSrc(0, bld.mkImm(0u));
1654
   i->setSrc(1, bld.mkImm(0u));
1655

1656
   return true;
1657
}
1658

1659
// The type that bests represents how each component can be stored when packed.
1660
static DataType
1661
getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
1662
{
1663
   switch (t->type) {
1664
   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
1665
   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
1666
   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
1667
   case UINT:
1668
      return (t->bits[c] == 8 ? TYPE_U8 :
1669
              (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
1670
   case SINT:
1671
      return (t->bits[c] == 8 ? TYPE_S8 :
1672
              (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
1673
   }
1674
   return TYPE_NONE;
1675
}
1676

1677
// The type that the rest of the shader expects to process this image type in.
1678
static DataType
1679
getShaderType(const ImgType type) {
1680
   switch (type) {
1681
   case FLOAT:
1682
   case UNORM:
1683
   case SNORM:
1684
      return TYPE_F32;
1685
   case UINT:
1686
      return TYPE_U32;
1687
   case SINT:
1688
      return TYPE_S32;
1689
   default:
1690
      assert(!"Impossible type");
1691
      return TYPE_NONE;
1692
   }
1693
}
1694

1695
// Reads the raw coordinates out of the input instruction, and returns a
1696
// single-value coordinate which is what the hardware expects to receive in a
1697
// ld/st op.
1698
Value *
1699
NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
1700
{
1701
   const int slot = su->tex.r;
1702
   const int dim = su->tex.target.getDim();
1703
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1704

1705
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1706
   const uint16_t bytes = (format->bits[0] + format->bits[1] +
1707
                           format->bits[2] + format->bits[3]) / 8;
1708
   uint16_t shift = ffs(bytes) - 1;
1709

1710
   // Buffer sizes don't necessarily fit in 16-bit values
1711
   if (su->tex.target == TEX_TARGET_BUFFER) {
1712
      return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1713
                        su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
1714
   }
1715

1716
   // For buffers, we just need the byte offset. And for 2d buffers we want
1717
   // the x coordinate in bytes as well.
1718
   Value *coords[3] = {};
1719
   for (int i = 0; i < arg; i++) {
1720
      Value *src[2];
1721
      bld.mkSplit(src, 2, su->getSrc(i));
1722
      coords[i] = src[0];
1723
      // For 1d-images, we want the y coord to be 0, which it will be here.
1724
      if (i == 0)
1725
         coords[1] = src[1];
1726
   }
1727

1728
   coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1729
                          coords[0], bld.loadImm(NULL, shift));
1730

1731
   if (su->tex.target.isMS()) {
1732
      Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
1733
      Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
1734
      coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
1735
      coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
1736
   }
1737

1738
   // If there are more dimensions, we just want the y-offset. But that needs
1739
   // to be adjusted up by the y-stride for array images.
1740
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
1741
      Value *index = coords[dim];
1742
      Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1743
      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
1744
      mul->sType = TYPE_U16;
1745
      Value *muls[2];
1746
      bld.mkSplit(muls, 2, mul->getDef(0));
1747
      if (dim > 1)
1748
         coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
1749
      else
1750
         coords[1] = muls[0];
1751
   }
1752

1753
   // 3d is special-cased. Note that a single "slice" of a 3d image may
1754
   // also be attached as 2d, so we have to do the same 3d processing for
1755
   // 2d as well, just in case. In order to remap a 3d image onto a 2d
1756
   // image, we have to retile it "by hand".
1757
   if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
1758
      Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
1759
      Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1760
      // Add the z coordinate for actual 3d-images
1761
      if (dim > 2)
1762
         coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
1763
      else
1764
         coords[2] = z;
1765

1766
      // Compute the surface parameters from tile shifts
1767
      Value *tile_shift[3];
1768
      Value *tile_size[3];
1769
      Value *tile_mask[3];
1770
      // We only ever use one kind of X-tiling.
1771
      tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
1772
      tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
1773
      tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
1774
      // Fetch the "real" tiling parameters of the underlying surface
1775
      for (int i = 1; i < 3; i++) {
1776
         tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
1777
         tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
1778
         tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
1779
      }
1780

1781
      // Compute the location of given coordinate, both inside the tile as
1782
      // well as which (linearly-laid out) tile it's in.
1783
      Value *coord_in_tile[3];
1784
      Value *tile[3];
1785
      for (int i = 0; i < 3; i++) {
1786
         coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
1787
         tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
1788
      }
1789

1790
      // Based on the "real" tiling parameters, compute x/y coordinates in the
1791
      // larger surface with 2d tiling that was supplied to the hardware. This
1792
      // was determined and verified with the help of the tiling pseudocode in
1793
      // the envytools docs.
1794
      //
1795
      // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
1796
      //         z_coord_in_tile * x_tile_size
1797
      // adj_y = y_coord_in_tile + y_tile * y_tile_size +
1798
      //         z_tile * y_tile_size * y_tiles
1799
      //
1800
      // Note: STRIDE_Y = y_tile_size * y_tiles
1801

1802
      coords[0] = bld.mkOp2v(
1803
            OP_ADD, TYPE_U16, bld.getSSA(2),
1804
            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1805
                       coord_in_tile[0],
1806
                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1807
                                  tile[0],
1808
                                  bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1809
                                             tile_shift[2], tile_shift[0]))),
1810
            bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1811
                       coord_in_tile[2], tile_shift[0]));
1812

1813
      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
1814
                                   tile[2], y_size_aligned);
1815
      mul->sType = TYPE_U16;
1816
      Value *muls[2];
1817
      bld.mkSplit(muls, 2, mul->getDef(0));
1818

1819
      coords[1] = bld.mkOp2v(
1820
            OP_ADD, TYPE_U16, bld.getSSA(2),
1821
            muls[0],
1822
            bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1823
                       coord_in_tile[1],
1824
                       bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1825
                                  tile[1], tile_shift[1])));
1826
   }
1827

1828
   return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
1829
}
1830

1831
// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
1832
// adjusted to make use of 16-bit math where possible.
1833
bool
1834
NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
1835
{
1836
   const int slot = su->tex.r;
1837
   assert(!su->getIndirectR());
1838

1839
   bld.setPosition(su, false);
1840

1841
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1842
   const int bytes = (su->tex.format->bits[0] +
1843
                      su->tex.format->bits[1] +
1844
                      su->tex.format->bits[2] +
1845
                      su->tex.format->bits[3]) / 8;
1846
   DataType ty = typeOfSize(bytes);
1847

1848
   Value *coord = processSurfaceCoords(su);
1849

1850
   Value *untypedDst[4] = {};
1851
   Value *typedDst[4] = {};
1852
   int i;
1853
   for (i = 0; i < bytes / 4; i++)
1854
      untypedDst[i] = bld.getSSA();
1855
   if (bytes < 4)
1856
      untypedDst[0] = bld.getSSA();
1857

1858
   for (i = 0; i < 4; i++)
1859
      typedDst[i] = su->getDef(i);
1860

1861
   Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
1862
   for (i = 0; i < 4 && untypedDst[i]; i++)
1863
      load->setDef(i, untypedDst[i]);
1864

1865
   // Unpack each component into the typed dsts
1866
   int bits = 0;
1867
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
1868
      if (!typedDst[i])
1869
         continue;
1870

1871
      if (i >= format->components) {
1872
         if (format->type == FLOAT ||
1873
             format->type == UNORM ||
1874
             format->type == SNORM)
1875
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
1876
         else
1877
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
1878
         continue;
1879
      }
1880

1881
      // Get just that component's data into the relevant place
1882
      if (format->bits[i] == 32)
1883
         bld.mkMov(typedDst[i], untypedDst[i]);
1884
      else if (format->bits[i] == 16) {
1885
         // We can always convert directly from the appropriate half of the
1886
         // loaded value into the typed result.
1887
         Value *src[2];
1888
         bld.mkSplit(src, 2, untypedDst[i / 2]);
1889
         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1890
                   getPackedType(format, i), src[i & 1]);
1891
      }
1892
      else if (format->bits[i] == 8) {
1893
         // Same approach as for 16 bits, but we have to massage the value a
1894
         // bit more, since we have to get the appropriate 8 bits from the
1895
         // half-register. In all cases, we can CVT from a 8-bit source, so we
1896
         // only have to shift when we want the upper 8 bits.
1897
         Value *src[2], *shifted;
1898
         bld.mkSplit(src, 2, untypedDst[0]);
1899
         DataType packedType = getPackedType(format, i);
1900
         if (i & 1)
1901
            shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
1902
         else
1903
            shifted = src[!!(i & 2)];
1904

1905
         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1906
                   packedType, shifted);
1907
      }
1908
      else {
1909
         // The options are 10, 11, and 2. Get it into a 32-bit reg, then
1910
         // shift/mask. That's where it'll have to end up anyways. For signed,
1911
         // we have to make sure to get sign-extension, so we actually have to
1912
         // shift *up* first, and then shift down. There's no advantage to
1913
         // AND'ing, so we don't.
1914
         DataType ty = TYPE_U32;
1915
         if (format->type == SNORM || format->type == SINT) {
1916
            ty = TYPE_S32;
1917
         }
1918

1919
         // Poor man's EXTBF
1920
         bld.mkOp2(
1921
               OP_SHR, ty, typedDst[i],
1922
               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
1923
               bld.loadImm(NULL, 32 - format->bits[i]));
1924

1925
         // If the stored data is already in the appropriate type, we don't
1926
         // have to do anything. Convert to float for the *NORM formats.
1927
         if (format->type == UNORM || format->type == SNORM)
1928
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
1929
      }
1930

1931
      // Normalize / convert as necessary
1932
      if (format->type == UNORM)
1933
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
1934
      else if (format->type == SNORM)
1935
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
1936
      else if (format->type == FLOAT && format->bits[i] < 16) {
1937
         // We expect the value to be in the low bits of the register, so we
1938
         // have to shift back up.
1939
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
1940
         Value *src[2];
1941
         bld.mkSplit(src, 2, typedDst[i]);
1942
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
1943
      }
1944
   }
1945

1946
   if (format->bgra) {
1947
      std::swap(typedDst[0], typedDst[2]);
1948
   }
1949

1950
   bld.getBB()->remove(su);
1951
   return true;
1952
}
1953

1954
bool
1955
NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
1956
{
1957
   const int slot = su->tex.r;
1958
   const int dim = su->tex.target.getDim();
1959
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1960
   assert(!su->getIndirectR());
1961

1962
   bld.setPosition(su, false);
1963

1964
   Value *coord = processSurfaceCoords(su);
1965

1966
   // This is guaranteed to be a 32-bit format. So there's nothing to
1967
   // pack/unpack.
1968
   Instruction *atom = bld.mkOp2(
1969
         OP_ATOM, su->dType, su->getDef(0),
1970
         bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
1971
   if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1972
      atom->setSrc(2, su->getSrc(arg + 1));
1973
   atom->setIndirect(0, 0, coord);
1974
   atom->subOp = su->subOp;
1975

1976
   bld.getBB()->remove(su);
1977
   return true;
1978
}
1979

1980
bool
1981
NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
1982
{
1983
   const int slot = su->tex.r;
1984
   const int dim = su->tex.target.getDim();
1985
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1986
   assert(!su->getIndirectR());
1987

1988
   bld.setPosition(su, false);
1989

1990
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
1991
   const int bytes = (su->tex.format->bits[0] +
1992
                      su->tex.format->bits[1] +
1993
                      su->tex.format->bits[2] +
1994
                      su->tex.format->bits[3]) / 8;
1995
   DataType ty = typeOfSize(bytes);
1996

1997
   Value *coord = processSurfaceCoords(su);
1998

1999
   // The packed values we will eventually store into memory
2000
   Value *untypedDst[4] = {};
2001
   // Each component's packed representation, in 16-bit registers (only used
2002
   // where appropriate)
2003
   Value *untypedDst16[4] = {};
2004
   // The original values that are being packed
2005
   Value *typedDst[4] = {};
2006
   int i;
2007

2008
   for (i = 0; i < bytes / 4; i++)
2009
      untypedDst[i] = bld.getSSA();
2010
   for (i = 0; i < format->components; i++)
2011
      untypedDst16[i] = bld.getSSA(2);
2012
   // Make sure we get at least one of each value allocated for the
2013
   // super-narrow formats.
2014
   if (bytes < 4)
2015
      untypedDst[0] = bld.getSSA();
2016
   if (bytes < 2)
2017
      untypedDst16[0] = bld.getSSA(2);
2018

2019
   for (i = 0; i < 4; i++) {
2020
      typedDst[i] = bld.getSSA();
2021
      bld.mkMov(typedDst[i], su->getSrc(arg + i));
2022
   }
2023

2024
   if (format->bgra) {
2025
      std::swap(typedDst[0], typedDst[2]);
2026
   }
2027

2028
   // Pack each component into the untyped dsts.
2029
   int bits = 0;
2030
   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
2031
      // Un-normalize / convert as necessary
2032
      if (format->type == UNORM)
2033
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
2034
      else if (format->type == SNORM)
2035
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
2036

2037
      // There is nothing to convert/pack for 32-bit values
2038
      if (format->bits[i] == 32) {
2039
         bld.mkMov(untypedDst[i], typedDst[i]);
2040
         continue;
2041
      }
2042

2043
      // The remainder of the cases will naturally want to deal in 16-bit
2044
      // registers. We will put these into untypedDst16 and then merge them
2045
      // together later.
2046
      if (format->type == FLOAT && format->bits[i] < 16) {
2047
         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
2048
         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
2049

2050
         // For odd bit sizes, it's easier to pack it into the final
2051
         // destination directly.
2052
         Value *tmp = bld.getSSA();
2053
         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2054
         if (i == 0) {
2055
            untypedDst[0] = tmp;
2056
         } else {
2057
            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2058
            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2059
         }
2060
      } else if (format->bits[i] == 16) {
2061
         // We can always convert the shader value into the packed value
2062
         // directly here
2063
         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
2064
                   getShaderType(format->type), typedDst[i]);
2065
      } else if (format->bits[i] < 16) {
2066
         DataType packedType = getPackedType(format, i);
2067
         DataType shaderType = getShaderType(format->type);
2068
         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
2069
         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
2070
            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
2071
         }
2072
         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
2073
         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
2074
         // the size, it's easier to dump them into a 32-bit value and OR
2075
         // everything later.
2076
         if (format->bits[i] != 8) {
2077
            // Restrict value to the appropriate bits (although maybe supposed
2078
            // to clamp instead?)
2079
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
2080
            // And merge into final packed value
2081
            Value *tmp = bld.getSSA();
2082
            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2083
            if (i == 0) {
2084
               untypedDst[0] = tmp;
2085
            } else {
2086
               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2087
               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2088
            }
2089
         } else if (i & 1) {
2090
            // Shift the 8-bit value up (so that it can be OR'd later)
2091
            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
2092
         } else if (packedType != TYPE_U8) {
2093
            // S8 (or the *16 if converted from float) will all have high bits
2094
            // set, so AND them out.
2095
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
2096
         }
2097
      }
2098
   }
2099

2100
   // OR pairs of 8-bit values together (into the even value)
2101
   if (format->bits[0] == 8) {
2102
      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
2103
         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
2104
   }
2105

2106
   // We'll always want to have at least a 32-bit source register for the store
2107
   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
2108
   if (format->bits[0] == 32) {
2109
      for (i = 0; i < 4 && untypedDst[i]; i++)
2110
         merge->setSrc(i, untypedDst[i]);
2111
   } else if (format->bits[0] == 16) {
2112
      for (i = 0; i < 4 && untypedDst16[i]; i++)
2113
         merge->setSrc(i, untypedDst16[i]);
2114
      if (i == 1)
2115
         merge->setSrc(i, bld.getSSA(2));
2116
   } else if (format->bits[0] == 8) {
2117
      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
2118
         merge->setSrc(i, untypedDst16[2 * i]);
2119
      if (i == 1)
2120
         merge->setSrc(i, bld.getSSA(2));
2121
   } else {
2122
      merge->setSrc(0, untypedDst[0]);
2123
   }
2124

2125
   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
2126

2127
   bld.getBB()->remove(su);
2128
   return true;
2129
}
2130

2131
bool
2132
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
2133
{
2134
   assert(prog->getType() == Program::TYPE_GEOMETRY);
2135

2136
   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
2137
   // later phase if that assertion ever triggers:
2138

2139
   ImmediateValue *imm = i->getSrc(0)->asImm();
2140
   assert(imm);
2141

2142
   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
2143

2144
   if (i->srcExists(1)) {
2145
      // indirect addressing of vertex in primitive space
2146

2147
      LValue *val = bld.getScratch();
2148
      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
2149
      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
2150
      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
2151

2152
      // NOTE: PFETCH directly to an $aX only works with direct addressing
2153
      i->op = OP_SHL;
2154
      i->setSrc(0, val);
2155
      i->setSrc(1, bld.mkImm(0));
2156
   }
2157

2158
   return true;
2159
}
2160

2161
// Set flags according to predicate and make the instruction read $cX.
2162
void
2163
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
2164
{
2165
   Value *pred = insn->getPredicate();
2166
   Value *cdst;
2167

2168
   // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
2169
   if (!pred ||
2170
       pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
2171
      return;
2172

2173
   cdst = bld.getSSA(1, FILE_FLAGS);
2174

2175
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
2176

2177
   insn->setPredicate(insn->cc, cdst);
2178
}
2179

2180
//
2181
// - add quadop dance for texturing
2182
// - put FP outputs in GPRs
2183
// - convert instruction sequences
2184
//
2185
bool
2186
NV50LoweringPreSSA::visit(Instruction *i)
2187
{
2188
   bld.setPosition(i, false);
2189

2190
   if (i->cc != CC_ALWAYS)
2191
      checkPredicate(i);
2192

2193
   switch (i->op) {
2194
   case OP_TEX:
2195
   case OP_TXF:
2196
   case OP_TXG:
2197
      return handleTEX(i->asTex());
2198
   case OP_TXB:
2199
      return handleTXB(i->asTex());
2200
   case OP_TXL:
2201
      return handleTXL(i->asTex());
2202
   case OP_TXD:
2203
      return handleTXD(i->asTex());
2204
   case OP_TXLQ:
2205
      return handleTXLQ(i->asTex());
2206
   case OP_TXQ:
2207
      return handleTXQ(i->asTex());
2208
   case OP_EX2:
2209
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2210
      i->setSrc(0, i->getDef(0));
2211
      break;
2212
   case OP_SET:
2213
      return handleSET(i);
2214
   case OP_SLCT:
2215
      return handleSLCT(i->asCmp());
2216
   case OP_SELP:
2217
      return handleSELP(i);
2218
   case OP_POW:
2219
      return handlePOW(i);
2220
   case OP_DIV:
2221
      return handleDIV(i);
2222
   case OP_SQRT:
2223
      return handleSQRT(i);
2224
   case OP_EXPORT:
2225
      return handleEXPORT(i);
2226
   case OP_LOAD:
2227
      return handleLOAD(i);
2228
   case OP_MEMBAR:
2229
      return handleMEMBAR(i);
2230
   case OP_ATOM:
2231
   case OP_STORE:
2232
      return handleLDST(i);
2233
   case OP_SULDP:
2234
      return handleSULDP(i->asTex());
2235
   case OP_SUSTP:
2236
      return handleSUSTP(i->asTex());
2237
   case OP_SUREDP:
2238
      return handleSUREDP(i->asTex());
2239
   case OP_SUQ:
2240
      return handleSUQ(i->asTex());
2241
   case OP_BUFQ:
2242
      return handleBUFQ(i);
2243
   case OP_RDSV:
2244
      return handleRDSV(i);
2245
   case OP_WRSV:
2246
      return handleWRSV(i);
2247
   case OP_CALL:
2248
      return handleCALL(i);
2249
   case OP_PRECONT:
2250
      return handlePRECONT(i);
2251
   case OP_CONT:
2252
      return handleCONT(i);
2253
   case OP_PFETCH:
2254
      return handlePFETCH(i);
2255
   default:
2256
      break;
2257
   }
2258
   return true;
2259
}
2260

2261
bool
2262
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
2263
{
2264
   bool ret = false;
2265

2266
   if (stage == CG_STAGE_PRE_SSA) {
2267
      NV50LoweringPreSSA pass(prog);
2268
      ret = pass.run(prog, false, true);
2269
   } else
2270
   if (stage == CG_STAGE_SSA) {
2271
      if (!prog->targetPriv)
2272
         prog->targetPriv = new std::list<Instruction *>();
2273
      NV50LegalizeSSA pass(prog);
2274
      ret = pass.run(prog, false, true);
2275
   } else
2276
   if (stage == CG_STAGE_POST_RA) {
2277
      NV50LegalizePostRA pass;
2278
      ret = pass.run(prog, false, true);
2279
      if (prog->targetPriv)
2280
         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
2281
   }
2282
   return ret;
2283
}
2284

2285
} // namespace nv50_ir
2286

2287
Product

Resources

Company