CoCalc -- ir3_compiler

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_compiler_nir.c
⁴⁵⁶⁵ views
1
/*
2
 * Copyright (C) 2015 Rob Clark <[email protected]>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
 * SOFTWARE.
22
 *
23
 * Authors:
24
 *    Rob Clark <[email protected]>
25
 */
26

27
#include <stdarg.h>
28

29
#include "util/u_math.h"
30
#include "util/u_memory.h"
31
#include "util/u_string.h"
32

33
#include "ir3_compiler.h"
34
#include "ir3_image.h"
35
#include "ir3_nir.h"
36
#include "ir3_shader.h"
37

38
#include "instr-a3xx.h"
39
#include "ir3.h"
40
#include "ir3_context.h"
41

42
void
43
ir3_handle_nonuniform(struct ir3_instruction *instr,
44
                      nir_intrinsic_instr *intrin)
45
{
46
   if (nir_intrinsic_has_access(intrin) &&
47
       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
48
      instr->flags |= IR3_INSTR_NONUNIF;
49
   }
50
}
51

52
void
53
ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
54
{
55
   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
56
   if (!intrin)
57
      return;
58

59
   instr->flags |= IR3_INSTR_B;
60
   instr->cat6.base = nir_intrinsic_desc_set(intrin);
61
}
62

63
static struct ir3_instruction *
64
create_input(struct ir3_context *ctx, unsigned compmask)
65
{
66
   struct ir3_instruction *in;
67

68
   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
69
   in->input.sysval = ~0;
70
   __ssa_dst(in)->wrmask = compmask;
71

72
   array_insert(ctx->ir, ctx->ir->inputs, in);
73

74
   return in;
75
}
76

77
static struct ir3_instruction *
78
create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
79
                  unsigned n)
80
{
81
   struct ir3_block *block = ctx->block;
82
   struct ir3_instruction *instr;
83
   /* packed inloc is fixed up later: */
84
   struct ir3_instruction *inloc = create_immed(block, n);
85

86
   if (coord) {
87
      instr = ir3_BARY_F(block, inloc, 0, coord, 0);
88
   } else if (ctx->compiler->flat_bypass) {
89
      instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
90
      instr->cat6.type = TYPE_U32;
91
      instr->cat6.iim_val = 1;
92
   } else {
93
      instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
94
      instr->srcs[1]->wrmask = 0x3;
95
   }
96

97
   return instr;
98
}
99

100
static struct ir3_instruction *
101
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
102
{
103
   /* first four vec4 sysval's reserved for UBOs: */
104
   /* NOTE: dp is in scalar, but there can be >4 dp components: */
105
   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
106
   unsigned n = const_state->offsets.driver_param;
107
   unsigned r = regid(n + dp / 4, dp % 4);
108
   return create_uniform(ctx->block, r);
109
}
110

111
/*
112
 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
113
 * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
114
 * trust that we will only see and/or/xor on those 1-bit values, so we can
115
 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
116
 * 0.
117
 */
118

119
/*
120
 * alu/sfu instructions:
121
 */
122

123
static struct ir3_instruction *
124
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
125
           unsigned src_bitsize, nir_op op)
126
{
127
   type_t src_type, dst_type;
128

129
   switch (op) {
130
   case nir_op_f2f32:
131
   case nir_op_f2f16_rtne:
132
   case nir_op_f2f16_rtz:
133
   case nir_op_f2f16:
134
   case nir_op_f2i32:
135
   case nir_op_f2i16:
136
   case nir_op_f2i8:
137
   case nir_op_f2u32:
138
   case nir_op_f2u16:
139
   case nir_op_f2u8:
140
      switch (src_bitsize) {
141
      case 32:
142
         src_type = TYPE_F32;
143
         break;
144
      case 16:
145
         src_type = TYPE_F16;
146
         break;
147
      default:
148
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
149
      }
150
      break;
151

152
   case nir_op_i2f32:
153
   case nir_op_i2f16:
154
   case nir_op_i2i32:
155
   case nir_op_i2i16:
156
   case nir_op_i2i8:
157
      switch (src_bitsize) {
158
      case 32:
159
         src_type = TYPE_S32;
160
         break;
161
      case 16:
162
         src_type = TYPE_S16;
163
         break;
164
      case 8:
165
         src_type = TYPE_S8;
166
         break;
167
      default:
168
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
169
      }
170
      break;
171

172
   case nir_op_u2f32:
173
   case nir_op_u2f16:
174
   case nir_op_u2u32:
175
   case nir_op_u2u16:
176
   case nir_op_u2u8:
177
      switch (src_bitsize) {
178
      case 32:
179
         src_type = TYPE_U32;
180
         break;
181
      case 16:
182
         src_type = TYPE_U16;
183
         break;
184
      case 8:
185
         src_type = TYPE_U8;
186
         break;
187
      default:
188
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
189
      }
190
      break;
191

192
   case nir_op_b2f16:
193
   case nir_op_b2f32:
194
   case nir_op_b2i8:
195
   case nir_op_b2i16:
196
   case nir_op_b2i32:
197
      src_type = TYPE_U32;
198
      break;
199

200
   default:
201
      ir3_context_error(ctx, "invalid conversion op: %u", op);
202
   }
203

204
   switch (op) {
205
   case nir_op_f2f32:
206
   case nir_op_i2f32:
207
   case nir_op_u2f32:
208
   case nir_op_b2f32:
209
      dst_type = TYPE_F32;
210
      break;
211

212
   case nir_op_f2f16_rtne:
213
   case nir_op_f2f16_rtz:
214
   case nir_op_f2f16:
215
   case nir_op_i2f16:
216
   case nir_op_u2f16:
217
   case nir_op_b2f16:
218
      dst_type = TYPE_F16;
219
      break;
220

221
   case nir_op_f2i32:
222
   case nir_op_i2i32:
223
   case nir_op_b2i32:
224
      dst_type = TYPE_S32;
225
      break;
226

227
   case nir_op_f2i16:
228
   case nir_op_i2i16:
229
   case nir_op_b2i16:
230
      dst_type = TYPE_S16;
231
      break;
232

233
   case nir_op_f2i8:
234
   case nir_op_i2i8:
235
   case nir_op_b2i8:
236
      dst_type = TYPE_S8;
237
      break;
238

239
   case nir_op_f2u32:
240
   case nir_op_u2u32:
241
      dst_type = TYPE_U32;
242
      break;
243

244
   case nir_op_f2u16:
245
   case nir_op_u2u16:
246
      dst_type = TYPE_U16;
247
      break;
248

249
   case nir_op_f2u8:
250
   case nir_op_u2u8:
251
      dst_type = TYPE_U8;
252
      break;
253

254
   default:
255
      ir3_context_error(ctx, "invalid conversion op: %u", op);
256
   }
257

258
   if (src_type == dst_type)
259
      return src;
260

261
   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
262

263
   if (op == nir_op_f2f16_rtne) {
264
      cov->cat1.round = ROUND_EVEN;
265
   } else if (op == nir_op_f2f16) {
266
      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
267
      nir_rounding_mode rounding_mode =
268
         nir_get_rounding_mode_from_float_controls(execution_mode,
269
                                                   nir_type_float16);
270
      if (rounding_mode == nir_rounding_mode_rtne)
271
         cov->cat1.round = ROUND_EVEN;
272
   }
273

274
   return cov;
275
}
276

277
/* For shift instructions NIR always has shift amount as 32 bit integer */
278
static struct ir3_instruction *
279
resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
280
                    unsigned bs)
281
{
282
   if (bs != 16)
283
      return src;
284

285
   return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
286
}
287

288
static void
289
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
290
{
291
   const nir_op_info *info = &nir_op_infos[alu->op];
292
   struct ir3_instruction **dst, *src[info->num_inputs];
293
   unsigned bs[info->num_inputs]; /* bit size */
294
   struct ir3_block *b = ctx->block;
295
   unsigned dst_sz, wrmask;
296
   type_t dst_type =
297
      nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32;
298

299
   if (alu->dest.dest.is_ssa) {
300
      dst_sz = alu->dest.dest.ssa.num_components;
301
      wrmask = (1 << dst_sz) - 1;
302
   } else {
303
      dst_sz = alu->dest.dest.reg.reg->num_components;
304
      wrmask = alu->dest.write_mask;
305
   }
306

307
   dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
308

309
   /* Vectors are special in that they have non-scalarized writemasks,
310
    * and just take the first swizzle channel for each argument in
311
    * order into each writemask channel.
312
    */
313
   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
314
       (alu->op == nir_op_vec4)) {
315

316
      for (int i = 0; i < info->num_inputs; i++) {
317
         nir_alu_src *asrc = &alu->src[i];
318

319
         compile_assert(ctx, !asrc->abs);
320
         compile_assert(ctx, !asrc->negate);
321

322
         src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
323
         if (!src[i])
324
            src[i] = create_immed_typed(ctx->block, 0, dst_type);
325
         dst[i] = ir3_MOV(b, src[i], dst_type);
326
      }
327

328
      ir3_put_dst(ctx, &alu->dest.dest);
329
      return;
330
   }
331

332
   /* We also get mov's with more than one component for mov's so
333
    * handle those specially:
334
    */
335
   if (alu->op == nir_op_mov) {
336
      nir_alu_src *asrc = &alu->src[0];
337
      struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
338

339
      for (unsigned i = 0; i < dst_sz; i++) {
340
         if (wrmask & (1 << i)) {
341
            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
342
         } else {
343
            dst[i] = NULL;
344
         }
345
      }
346

347
      ir3_put_dst(ctx, &alu->dest.dest);
348
      return;
349
   }
350

351
   /* General case: We can just grab the one used channel per src. */
352
   for (int i = 0; i < info->num_inputs; i++) {
353
      unsigned chan = ffs(alu->dest.write_mask) - 1;
354
      nir_alu_src *asrc = &alu->src[i];
355

356
      compile_assert(ctx, !asrc->abs);
357
      compile_assert(ctx, !asrc->negate);
358

359
      src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
360
      bs[i] = nir_src_bit_size(asrc->src);
361

362
      compile_assert(ctx, src[i]);
363
   }
364

365
   switch (alu->op) {
366
   case nir_op_f2f32:
367
   case nir_op_f2f16_rtne:
368
   case nir_op_f2f16_rtz:
369
   case nir_op_f2f16:
370
   case nir_op_f2i32:
371
   case nir_op_f2i16:
372
   case nir_op_f2i8:
373
   case nir_op_f2u32:
374
   case nir_op_f2u16:
375
   case nir_op_f2u8:
376
   case nir_op_i2f32:
377
   case nir_op_i2f16:
378
   case nir_op_i2i32:
379
   case nir_op_i2i16:
380
   case nir_op_i2i8:
381
   case nir_op_u2f32:
382
   case nir_op_u2f16:
383
   case nir_op_u2u32:
384
   case nir_op_u2u16:
385
   case nir_op_u2u8:
386
   case nir_op_b2f16:
387
   case nir_op_b2f32:
388
   case nir_op_b2i8:
389
   case nir_op_b2i16:
390
   case nir_op_b2i32:
391
      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
392
      break;
393

394
   case nir_op_fquantize2f16:
395
      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
396
                          16, nir_op_f2f32);
397
      break;
398
   case nir_op_f2b1:
399
      dst[0] = ir3_CMPS_F(
400
         b, src[0], 0,
401
         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
402
      dst[0]->cat2.condition = IR3_COND_NE;
403
      break;
404

405
   case nir_op_i2b1:
406
      /* i2b1 will appear when translating from nir_load_ubo or
407
       * nir_intrinsic_load_ssbo, where any non-zero value is true.
408
       */
409
      dst[0] = ir3_CMPS_S(
410
         b, src[0], 0,
411
         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
412
      dst[0]->cat2.condition = IR3_COND_NE;
413
      break;
414

415
   case nir_op_b2b1:
416
      /* b2b1 will appear when translating from
417
       *
418
       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
419
       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
420
       *
421
       * A negate can turn those into a 1 or 0 for us.
422
       */
423
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
424
      break;
425

426
   case nir_op_b2b32:
427
      /* b2b32 will appear when converting our 1-bit bools to a store_shared
428
       * argument.
429
       *
430
       * A negate can turn those into a ~0 for us.
431
       */
432
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
433
      break;
434

435
   case nir_op_fneg:
436
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
437
      break;
438
   case nir_op_fabs:
439
      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
440
      break;
441
   case nir_op_fmax:
442
      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
443
      break;
444
   case nir_op_fmin:
445
      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
446
      break;
447
   case nir_op_fsat:
448
      /* if there is just a single use of the src, and it supports
449
       * (sat) bit, we can just fold the (sat) flag back to the
450
       * src instruction and create a mov.  This is easier for cp
451
       * to eliminate.
452
       */
453
      if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
454
          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
455
         src[0]->flags |= IR3_INSTR_SAT;
456
         dst[0] = ir3_MOV(b, src[0], dst_type);
457
      } else {
458
         /* otherwise generate a max.f that saturates.. blob does
459
          * similar (generating a cat2 mov using max.f)
460
          */
461
         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
462
         dst[0]->flags |= IR3_INSTR_SAT;
463
      }
464
      break;
465
   case nir_op_fmul:
466
      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
467
      break;
468
   case nir_op_fadd:
469
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
470
      break;
471
   case nir_op_fsub:
472
      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
473
      break;
474
   case nir_op_ffma:
475
      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
476
      break;
477
   case nir_op_fddx:
478
   case nir_op_fddx_coarse:
479
      dst[0] = ir3_DSX(b, src[0], 0);
480
      dst[0]->cat5.type = TYPE_F32;
481
      break;
482
   case nir_op_fddx_fine:
483
      dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
484
      dst[0]->cat5.type = TYPE_F32;
485
      break;
486
   case nir_op_fddy:
487
   case nir_op_fddy_coarse:
488
      dst[0] = ir3_DSY(b, src[0], 0);
489
      dst[0]->cat5.type = TYPE_F32;
490
      break;
491
      break;
492
   case nir_op_fddy_fine:
493
      dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
494
      dst[0]->cat5.type = TYPE_F32;
495
      break;
496
   case nir_op_flt:
497
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
498
      dst[0]->cat2.condition = IR3_COND_LT;
499
      break;
500
   case nir_op_fge:
501
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
502
      dst[0]->cat2.condition = IR3_COND_GE;
503
      break;
504
   case nir_op_feq:
505
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
506
      dst[0]->cat2.condition = IR3_COND_EQ;
507
      break;
508
   case nir_op_fneu:
509
      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
510
      dst[0]->cat2.condition = IR3_COND_NE;
511
      break;
512
   case nir_op_fceil:
513
      dst[0] = ir3_CEIL_F(b, src[0], 0);
514
      break;
515
   case nir_op_ffloor:
516
      dst[0] = ir3_FLOOR_F(b, src[0], 0);
517
      break;
518
   case nir_op_ftrunc:
519
      dst[0] = ir3_TRUNC_F(b, src[0], 0);
520
      break;
521
   case nir_op_fround_even:
522
      dst[0] = ir3_RNDNE_F(b, src[0], 0);
523
      break;
524
   case nir_op_fsign:
525
      dst[0] = ir3_SIGN_F(b, src[0], 0);
526
      break;
527

528
   case nir_op_fsin:
529
      dst[0] = ir3_SIN(b, src[0], 0);
530
      break;
531
   case nir_op_fcos:
532
      dst[0] = ir3_COS(b, src[0], 0);
533
      break;
534
   case nir_op_frsq:
535
      dst[0] = ir3_RSQ(b, src[0], 0);
536
      break;
537
   case nir_op_frcp:
538
      dst[0] = ir3_RCP(b, src[0], 0);
539
      break;
540
   case nir_op_flog2:
541
      dst[0] = ir3_LOG2(b, src[0], 0);
542
      break;
543
   case nir_op_fexp2:
544
      dst[0] = ir3_EXP2(b, src[0], 0);
545
      break;
546
   case nir_op_fsqrt:
547
      dst[0] = ir3_SQRT(b, src[0], 0);
548
      break;
549

550
   case nir_op_iabs:
551
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
552
      break;
553
   case nir_op_iadd:
554
      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
555
      break;
556
   case nir_op_iand:
557
      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
558
      break;
559
   case nir_op_imax:
560
      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
561
      break;
562
   case nir_op_umax:
563
      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
564
      break;
565
   case nir_op_imin:
566
      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
567
      break;
568
   case nir_op_umin:
569
      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
570
      break;
571
   case nir_op_umul_low:
572
      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
573
      break;
574
   case nir_op_imadsh_mix16:
575
      dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
576
      break;
577
   case nir_op_imad24_ir3:
578
      dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
579
      break;
580
   case nir_op_imul:
581
      compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
582
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
583
      break;
584
   case nir_op_imul24:
585
      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
586
      break;
587
   case nir_op_ineg:
588
      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
589
      break;
590
   case nir_op_inot:
591
      if (bs[0] == 1) {
592
         dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
593
      } else {
594
         dst[0] = ir3_NOT_B(b, src[0], 0);
595
      }
596
      break;
597
   case nir_op_ior:
598
      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
599
      break;
600
   case nir_op_ishl:
601
      dst[0] =
602
         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
603
      break;
604
   case nir_op_ishr:
605
      dst[0] =
606
         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
607
      break;
608
   case nir_op_isub:
609
      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
610
      break;
611
   case nir_op_ixor:
612
      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
613
      break;
614
   case nir_op_ushr:
615
      dst[0] =
616
         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
617
      break;
618
   case nir_op_ilt:
619
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
620
      dst[0]->cat2.condition = IR3_COND_LT;
621
      break;
622
   case nir_op_ige:
623
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
624
      dst[0]->cat2.condition = IR3_COND_GE;
625
      break;
626
   case nir_op_ieq:
627
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
628
      dst[0]->cat2.condition = IR3_COND_EQ;
629
      break;
630
   case nir_op_ine:
631
      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
632
      dst[0]->cat2.condition = IR3_COND_NE;
633
      break;
634
   case nir_op_ult:
635
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
636
      dst[0]->cat2.condition = IR3_COND_LT;
637
      break;
638
   case nir_op_uge:
639
      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
640
      dst[0]->cat2.condition = IR3_COND_GE;
641
      break;
642

643
   case nir_op_bcsel: {
644
      struct ir3_instruction *cond = src[0];
645

646
      /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
647
       * we can ignore that and use original cond, since the nonzero-ness of
648
       * cond stays the same.
649
       */
650
      if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
651
          (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
652
             IR3_REG_SNEG) {
653
         cond = cond->srcs[0]->def->instr;
654
      }
655

656
      compile_assert(ctx, bs[1] == bs[2]);
657
      /* The condition's size has to match the other two arguments' size, so
658
       * convert down if necessary.
659
       */
660
      if (bs[1] == 16) {
661
         struct hash_entry *prev_entry =
662
            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
663
         if (prev_entry) {
664
            cond = prev_entry->data;
665
         } else {
666
            cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
667
            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
668
         }
669
      }
670

671
      if (bs[1] != 16)
672
         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
673
      else
674
         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
675
      break;
676
   }
677
   case nir_op_bit_count: {
678
      // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
679
      // double check on earlier gen's.  Once half-precision support is
680
      // in place, this should probably move to a NIR lowering pass:
681
      struct ir3_instruction *hi, *lo;
682

683
      hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
684
                   TYPE_U16);
685
      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
686

687
      hi = ir3_CBITS_B(b, hi, 0);
688
      lo = ir3_CBITS_B(b, lo, 0);
689

690
      // TODO maybe the builders should default to making dst half-precision
691
      // if the src's were half precision, to make this less awkward.. otoh
692
      // we should probably just do this lowering in NIR.
693
      hi->dsts[0]->flags |= IR3_REG_HALF;
694
      lo->dsts[0]->flags |= IR3_REG_HALF;
695

696
      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
697
      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
698
      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
699
      break;
700
   }
701
   case nir_op_ifind_msb: {
702
      struct ir3_instruction *cmp;
703
      dst[0] = ir3_CLZ_S(b, src[0], 0);
704
      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
705
      cmp->cat2.condition = IR3_COND_GE;
706
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
707
                           0, cmp, 0, dst[0], 0);
708
      break;
709
   }
710
   case nir_op_ufind_msb:
711
      dst[0] = ir3_CLZ_B(b, src[0], 0);
712
      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
713
                           0, src[0], 0, dst[0], 0);
714
      break;
715
   case nir_op_find_lsb:
716
      dst[0] = ir3_BFREV_B(b, src[0], 0);
717
      dst[0] = ir3_CLZ_B(b, dst[0], 0);
718
      break;
719
   case nir_op_bitfield_reverse:
720
      dst[0] = ir3_BFREV_B(b, src[0], 0);
721
      break;
722

723
   default:
724
      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
725
                        nir_op_infos[alu->op].name);
726
      break;
727
   }
728

729
   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
730
      assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
731
      assert(dst_sz == 1);
732
   } else {
733
      /* 1-bit values stored in 32-bit registers are only valid for certain
734
       * ALU ops.
735
       */
736
      switch (alu->op) {
737
      case nir_op_iand:
738
      case nir_op_ior:
739
      case nir_op_ixor:
740
      case nir_op_inot:
741
      case nir_op_bcsel:
742
         break;
743
      default:
744
         compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
745
      }
746
   }
747

748
   ir3_put_dst(ctx, &alu->dest.dest);
749
}
750

751
static void
752
emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
753
                            struct ir3_instruction **dst)
754
{
755
   struct ir3_block *b = ctx->block;
756

757
   unsigned ncomp = intr->num_components;
758
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
759
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
760
   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
761
   ldc->dsts[0]->wrmask = MASK(ncomp);
762
   ldc->cat6.iim_val = ncomp;
763
   ldc->cat6.d = nir_intrinsic_component(intr);
764
   ldc->cat6.type = TYPE_U32;
765

766
   ir3_handle_bindless_cat6(ldc, intr->src[0]);
767
   if (ldc->flags & IR3_INSTR_B)
768
      ctx->so->bindless_ubo = true;
769
   ir3_handle_nonuniform(ldc, intr);
770

771
   ir3_split_dest(b, dst, ldc, 0, ncomp);
772
}
773

774
/* handles direct/indirect UBO reads: */
775
static void
776
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
777
                        struct ir3_instruction **dst)
778
{
779
   struct ir3_block *b = ctx->block;
780
   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
781
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
782
   unsigned ubo = regid(const_state->offsets.ubo, 0);
783
   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
784

785
   int off = 0;
786

787
   /* First src is ubo index, which could either be an immed or not: */
788
   src0 = ir3_get_src(ctx, &intr->src[0])[0];
789
   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
790
      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
791
      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
792
   } else {
793
      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
794
                                        ir3_get_addr0(ctx, src0, ptrsz));
795
      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
796
                                        ir3_get_addr0(ctx, src0, ptrsz));
797

798
      /* NOTE: since relative addressing is used, make sure constlen is
799
       * at least big enough to cover all the UBO addresses, since the
800
       * assembler won't know what the max address reg is.
801
       */
802
      ctx->so->constlen =
803
         MAX2(ctx->so->constlen,
804
              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
805
   }
806

807
   /* note: on 32bit gpu's base_hi is ignored and DCE'd */
808
   addr = base_lo;
809

810
   if (nir_src_is_const(intr->src[1])) {
811
      off += nir_src_as_uint(intr->src[1]);
812
   } else {
813
      /* For load_ubo_indirect, second src is indirect offset: */
814
      src1 = ir3_get_src(ctx, &intr->src[1])[0];
815

816
      /* and add offset to addr: */
817
      addr = ir3_ADD_S(b, addr, 0, src1, 0);
818
   }
819

820
   /* if offset is to large to encode in the ldg, split it out: */
821
   if ((off + (intr->num_components * 4)) > 1024) {
822
      /* split out the minimal amount to improve the odds that
823
       * cp can fit the immediate in the add.s instruction:
824
       */
825
      unsigned off2 = off + (intr->num_components * 4) - 1024;
826
      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
827
      off -= off2;
828
   }
829

830
   if (ptrsz == 2) {
831
      struct ir3_instruction *carry;
832

833
      /* handle 32b rollover, ie:
834
       *   if (addr < base_lo)
835
       *      base_hi++
836
       */
837
      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
838
      carry->cat2.condition = IR3_COND_LT;
839
      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
840

841
      addr = ir3_collect(ctx, addr, base_hi);
842
   }
843

844
   for (int i = 0; i < intr->num_components; i++) {
845
      struct ir3_instruction *load =
846
         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
847
                 create_immed(b, 1), 0); /* num components */
848
      load->cat6.type = TYPE_U32;
849
      dst[i] = load;
850
   }
851
}
852

853
/* src[] = { block_index } */
854
static void
855
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
856
                         struct ir3_instruction **dst)
857
{
858
   if (ir3_bindless_resource(intr->src[0])) {
859
      struct ir3_block *b = ctx->block;
860
      struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
861
      struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
862
      resinfo->cat6.iim_val = 1;
863
      resinfo->cat6.d = 1;
864
      resinfo->cat6.type = TYPE_U32;
865
      resinfo->cat6.typed = false;
866
      /* resinfo has no writemask and always writes out 3 components */
867
      resinfo->dsts[0]->wrmask = MASK(3);
868
      ir3_handle_bindless_cat6(resinfo, intr->src[0]);
869
      struct ir3_instruction *resinfo_dst;
870
      ir3_split_dest(b, &resinfo_dst, resinfo, 0, 1);
871
      /* Unfortunately resinfo returns the array length, i.e. in dwords,
872
       * while NIR expects us to return the size in bytes.
873
       *
874
       * TODO: fix this in NIR.
875
       */
876
      *dst = ir3_SHL_B(b, resinfo_dst, 0, create_immed(b, 2), 0);
877
      return;
878
   }
879

880
   /* SSBO size stored as a const starting at ssbo_sizes: */
881
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
882
   unsigned blk_idx = nir_src_as_uint(intr->src[0]);
883
   unsigned idx = regid(const_state->offsets.ssbo_sizes, 0) +
884
                  const_state->ssbo_size.off[blk_idx];
885

886
   debug_assert(const_state->ssbo_size.mask & (1 << blk_idx));
887

888
   dst[0] = create_uniform(ctx->block, idx);
889
}
890

891
/* src[] = { offset }. const_index[] = { base } */
892
static void
893
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
894
                           struct ir3_instruction **dst)
895
{
896
   struct ir3_block *b = ctx->block;
897
   struct ir3_instruction *ldl, *offset;
898
   unsigned base;
899

900
   offset = ir3_get_src(ctx, &intr->src[0])[0];
901
   base = nir_intrinsic_base(intr);
902

903
   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
904
                 create_immed(b, intr->num_components), 0);
905

906
   ldl->cat6.type = utype_dst(intr->dest);
907
   ldl->dsts[0]->wrmask = MASK(intr->num_components);
908

909
   ldl->barrier_class = IR3_BARRIER_SHARED_R;
910
   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
911

912
   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
913
}
914

915
/* src[] = { value, offset }. const_index[] = { base, write_mask } */
916
static void
917
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
918
{
919
   struct ir3_block *b = ctx->block;
920
   struct ir3_instruction *stl, *offset;
921
   struct ir3_instruction *const *value;
922
   unsigned base, wrmask, ncomp;
923

924
   value = ir3_get_src(ctx, &intr->src[0]);
925
   offset = ir3_get_src(ctx, &intr->src[1])[0];
926

927
   base = nir_intrinsic_base(intr);
928
   wrmask = nir_intrinsic_write_mask(intr);
929
   ncomp = ffs(~wrmask) - 1;
930

931
   assert(wrmask == BITFIELD_MASK(intr->num_components));
932

933
   stl = ir3_STL(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
934
                 create_immed(b, ncomp), 0);
935
   stl->cat6.dst_offset = base;
936
   stl->cat6.type = utype_src(intr->src[0]);
937
   stl->barrier_class = IR3_BARRIER_SHARED_W;
938
   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
939

940
   array_insert(b, b->keeps, stl);
941
}
942

943
/* src[] = { offset }. const_index[] = { base } */
944
static void
945
emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
946
                               nir_intrinsic_instr *intr,
947
                               struct ir3_instruction **dst)
948
{
949
   struct ir3_block *b = ctx->block;
950
   struct ir3_instruction *load, *offset;
951
   unsigned base;
952

953
   offset = ir3_get_src(ctx, &intr->src[0])[0];
954
   base = nir_intrinsic_base(intr);
955

956
   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
957
                   create_immed(b, intr->num_components), 0);
958

959
   /* for a650, use LDL for tess ctrl inputs: */
960
   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
961
      load->opc = OPC_LDL;
962

963
   load->cat6.type = utype_dst(intr->dest);
964
   load->dsts[0]->wrmask = MASK(intr->num_components);
965

966
   load->barrier_class = IR3_BARRIER_SHARED_R;
967
   load->barrier_conflict = IR3_BARRIER_SHARED_W;
968

969
   ir3_split_dest(b, dst, load, 0, intr->num_components);
970
}
971

972
/* src[] = { value, offset }. const_index[] = { base } */
973
static void
974
emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
975
                                nir_intrinsic_instr *intr)
976
{
977
   struct ir3_block *b = ctx->block;
978
   struct ir3_instruction *store, *offset;
979
   struct ir3_instruction *const *value;
980

981
   value = ir3_get_src(ctx, &intr->src[0]);
982
   offset = ir3_get_src(ctx, &intr->src[1])[0];
983

984
   store = ir3_STLW(b, offset, 0,
985
                    ir3_create_collect(ctx, value, intr->num_components), 0,
986
                    create_immed(b, intr->num_components), 0);
987

988
   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
989
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
990
       ctx->compiler->tess_use_shared)
991
      store->opc = OPC_STL;
992

993
   store->cat6.dst_offset = nir_intrinsic_base(intr);
994
   store->cat6.type = utype_src(intr->src[0]);
995
   store->barrier_class = IR3_BARRIER_SHARED_W;
996
   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
997

998
   array_insert(b, b->keeps, store);
999
}
1000

1001
/*
1002
 * CS shared variable atomic intrinsics
1003
 *
1004
 * All of the shared variable atomic memory operations read a value from
1005
 * memory, compute a new value using one of the operations below, write the
1006
 * new value to memory, and return the original value read.
1007
 *
1008
 * All operations take 2 sources except CompSwap that takes 3. These
1009
 * sources represent:
1010
 *
1011
 * 0: The offset into the shared variable storage region that the atomic
1012
 *    operation will operate on.
1013
 * 1: The data parameter to the atomic function (i.e. the value to add
1014
 *    in shared_atomic_add, etc).
1015
 * 2: For CompSwap only: the second data parameter.
1016
 */
1017
static struct ir3_instruction *
1018
emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1019
{
1020
   struct ir3_block *b = ctx->block;
1021
   struct ir3_instruction *atomic, *src0, *src1;
1022
   type_t type = TYPE_U32;
1023

1024
   src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1025
   src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1026

1027
   switch (intr->intrinsic) {
1028
   case nir_intrinsic_shared_atomic_add:
1029
      atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1030
      break;
1031
   case nir_intrinsic_shared_atomic_imin:
1032
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1033
      type = TYPE_S32;
1034
      break;
1035
   case nir_intrinsic_shared_atomic_umin:
1036
      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1037
      break;
1038
   case nir_intrinsic_shared_atomic_imax:
1039
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1040
      type = TYPE_S32;
1041
      break;
1042
   case nir_intrinsic_shared_atomic_umax:
1043
      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1044
      break;
1045
   case nir_intrinsic_shared_atomic_and:
1046
      atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1047
      break;
1048
   case nir_intrinsic_shared_atomic_or:
1049
      atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1050
      break;
1051
   case nir_intrinsic_shared_atomic_xor:
1052
      atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1053
      break;
1054
   case nir_intrinsic_shared_atomic_exchange:
1055
      atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1056
      break;
1057
   case nir_intrinsic_shared_atomic_comp_swap:
1058
      /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1059
      src1 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[2])[0], src1);
1060
      atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1061
      break;
1062
   default:
1063
      unreachable("boo");
1064
   }
1065

1066
   atomic->cat6.iim_val = 1;
1067
   atomic->cat6.d = 1;
1068
   atomic->cat6.type = type;
1069
   atomic->barrier_class = IR3_BARRIER_SHARED_W;
1070
   atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1071

1072
   /* even if nothing consume the result, we can't DCE the instruction: */
1073
   array_insert(b, b->keeps, atomic);
1074

1075
   return atomic;
1076
}
1077

1078
/* src[] = { offset }. */
1079
static void
1080
emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1081
                            struct ir3_instruction **dst)
1082
{
1083
   struct ir3_block *b = ctx->block;
1084
   struct ir3_instruction *ldp, *offset;
1085

1086
   offset = ir3_get_src(ctx, &intr->src[0])[0];
1087

1088
   ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
1089
                 create_immed(b, intr->num_components), 0);
1090

1091
   ldp->cat6.type = utype_dst(intr->dest);
1092
   ldp->dsts[0]->wrmask = MASK(intr->num_components);
1093

1094
   ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1095
   ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1096

1097
   ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1098
}
1099

1100
/* src[] = { value, offset }. const_index[] = { write_mask } */
1101
static void
1102
emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1103
{
1104
   struct ir3_block *b = ctx->block;
1105
   struct ir3_instruction *stp, *offset;
1106
   struct ir3_instruction *const *value;
1107
   unsigned wrmask, ncomp;
1108

1109
   value = ir3_get_src(ctx, &intr->src[0]);
1110
   offset = ir3_get_src(ctx, &intr->src[1])[0];
1111

1112
   wrmask = nir_intrinsic_write_mask(intr);
1113
   ncomp = ffs(~wrmask) - 1;
1114

1115
   assert(wrmask == BITFIELD_MASK(intr->num_components));
1116

1117
   stp = ir3_STP(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
1118
                 create_immed(b, ncomp), 0);
1119
   stp->cat6.dst_offset = 0;
1120
   stp->cat6.type = utype_src(intr->src[0]);
1121
   stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1122
   stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1123

1124
   array_insert(b, b->keeps, stp);
1125
}
1126

1127
struct tex_src_info {
1128
   /* For prefetch */
1129
   unsigned tex_base, samp_base, tex_idx, samp_idx;
1130
   /* For normal tex instructions */
1131
   unsigned base, combined_idx, a1_val, flags;
1132
   struct ir3_instruction *samp_tex;
1133
};
1134

1135
/* TODO handle actual indirect/dynamic case.. which is going to be weird
1136
 * to handle with the image_mapping table..
1137
 */
1138
static struct tex_src_info
1139
get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1140
{
1141
   struct ir3_block *b = ctx->block;
1142
   struct tex_src_info info = {0};
1143
   nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
1144
   ctx->so->bindless_tex = true;
1145

1146
   if (bindless_tex) {
1147
      /* Bindless case */
1148
      info.flags |= IR3_INSTR_B;
1149

1150
      /* Gather information required to determine which encoding to
1151
       * choose as well as for prefetch.
1152
       */
1153
      info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1154
      bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1155
      if (tex_const)
1156
         info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1157
      info.samp_idx = 0;
1158

1159
      /* Choose encoding. */
1160
      if (tex_const && info.tex_idx < 256) {
1161
         if (info.tex_idx < 16) {
1162
            /* Everything fits within the instruction */
1163
            info.base = info.tex_base;
1164
            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
1165
         } else {
1166
            info.base = info.tex_base;
1167
            info.a1_val = info.tex_idx << 3;
1168
            info.combined_idx = 0;
1169
            info.flags |= IR3_INSTR_A1EN;
1170
         }
1171
         info.samp_tex = NULL;
1172
      } else {
1173
         info.flags |= IR3_INSTR_S2EN;
1174
         info.base = info.tex_base;
1175

1176
         /* Note: the indirect source is now a vec2 instead of hvec2 */
1177
         struct ir3_instruction *texture, *sampler;
1178

1179
         texture = ir3_get_src(ctx, &intr->src[0])[0];
1180
         sampler = create_immed(b, 0);
1181
         info.samp_tex = ir3_collect(ctx, texture, sampler);
1182
      }
1183
   } else {
1184
      info.flags |= IR3_INSTR_S2EN;
1185
      unsigned slot = nir_src_as_uint(intr->src[0]);
1186
      unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
1187
      struct ir3_instruction *texture, *sampler;
1188

1189
      texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1190
      sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1191

1192
      info.samp_tex = ir3_collect(ctx, sampler, texture);
1193
   }
1194

1195
   return info;
1196
}
1197

1198
static struct ir3_instruction *
1199
emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1200
         type_t type, unsigned wrmask, struct ir3_instruction *src0,
1201
         struct ir3_instruction *src1)
1202
{
1203
   struct ir3_instruction *sam, *addr;
1204
   if (info.flags & IR3_INSTR_A1EN) {
1205
      addr = ir3_get_addr1(ctx, info.a1_val);
1206
   }
1207
   sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags, info.samp_tex, src0,
1208
                 src1);
1209
   if (info.flags & IR3_INSTR_A1EN) {
1210
      ir3_instr_set_address(sam, addr);
1211
   }
1212
   if (info.flags & IR3_INSTR_B) {
1213
      sam->cat5.tex_base = info.base;
1214
      sam->cat5.samp = info.combined_idx;
1215
   }
1216
   return sam;
1217
}
1218

1219
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
1220
static void
1221
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1222
                          struct ir3_instruction **dst)
1223
{
1224
   struct ir3_block *b = ctx->block;
1225
   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1226
   struct ir3_instruction *sam;
1227
   struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1228
   struct ir3_instruction *coords[4];
1229
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1230
   type_t type = ir3_get_type_for_image_intrinsic(intr);
1231

1232
   /* hmm, this seems a bit odd, but it is what blob does and (at least
1233
    * a5xx) just faults on bogus addresses otherwise:
1234
    */
1235
   if (flags & IR3_INSTR_3D) {
1236
      flags &= ~IR3_INSTR_3D;
1237
      flags |= IR3_INSTR_A;
1238
   }
1239
   info.flags |= flags;
1240

1241
   for (unsigned i = 0; i < ncoords; i++)
1242
      coords[i] = src0[i];
1243

1244
   if (ncoords == 1)
1245
      coords[ncoords++] = create_immed(b, 0);
1246

1247
   sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1248
                  ir3_create_collect(ctx, coords, ncoords), NULL);
1249

1250
   ir3_handle_nonuniform(sam, intr);
1251

1252
   sam->barrier_class = IR3_BARRIER_IMAGE_R;
1253
   sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1254

1255
   ir3_split_dest(b, dst, sam, 0, 4);
1256
}
1257

1258
/* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1259
void
1260
emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1261
                              nir_intrinsic_instr *intr,
1262
                              struct ir3_instruction **dst)
1263
{
1264
   struct ir3_block *b = ctx->block;
1265
   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1266
   struct ir3_instruction *sam, *lod;
1267
   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1268
   type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;
1269

1270
   info.flags |= flags;
1271
   assert(nir_src_as_uint(intr->src[1]) == 0);
1272
   lod = create_immed(b, 0);
1273
   sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1274

1275
   /* Array size actually ends up in .w rather than .z. This doesn't
1276
    * matter for miplevel 0, but for higher mips the value in z is
1277
    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1278
    * returned, which means that we have to add 1 to it for arrays for
1279
    * a3xx.
1280
    *
1281
    * Note use a temporary dst and then copy, since the size of the dst
1282
    * array that is passed in is based on nir's understanding of the
1283
    * result size, not the hardware's
1284
    */
1285
   struct ir3_instruction *tmp[4];
1286

1287
   ir3_split_dest(b, tmp, sam, 0, 4);
1288

1289
   for (unsigned i = 0; i < ncoords; i++)
1290
      dst[i] = tmp[i];
1291

1292
   if (flags & IR3_INSTR_A) {
1293
      if (ctx->compiler->levels_add_one) {
1294
         dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1295
      } else {
1296
         dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1297
      }
1298
   }
1299
}
1300

1301
static void
1302
emit_control_barrier(struct ir3_context *ctx)
1303
{
1304
   /* Hull shaders dispatch 32 wide so an entire patch will always
1305
    * fit in a single warp and execute in lock-step. Consequently,
1306
    * we don't need to do anything for TCS barriers. Emitting
1307
    * barrier instruction will deadlock.
1308
    */
1309
   if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1310
      return;
1311

1312
   struct ir3_block *b = ctx->block;
1313
   struct ir3_instruction *barrier = ir3_BAR(b);
1314
   barrier->cat7.g = true;
1315
   if (ctx->compiler->gpu_id < 600)
1316
      barrier->cat7.l = true;
1317
   barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1318
   barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1319
   array_insert(b, b->keeps, barrier);
1320
}
1321

1322
static void
1323
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1324
{
1325
   struct ir3_block *b = ctx->block;
1326
   struct ir3_instruction *barrier;
1327

1328
   /* TODO: find out why there is a major difference of .l usage
1329
    * between a5xx and a6xx,
1330
    */
1331

1332
   switch (intr->intrinsic) {
1333
   case nir_intrinsic_control_barrier:
1334
      emit_control_barrier(ctx);
1335
      return;
1336
   case nir_intrinsic_scoped_barrier: {
1337
      nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
1338
      nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1339

1340
      if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1341
         /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
1342
          * because hull shaders dispatch 32 wide so an entire patch will
1343
          * always fit in a single warp and execute in lock-step.
1344
          *
1345
          * TODO: memory barrier also tells us not to reorder stores, this
1346
          * information is lost here (backend doesn't reorder stores so we
1347
          * are safe for now).
1348
          */
1349
         modes &= ~nir_var_shader_out;
1350
      }
1351

1352
      assert(!(modes & nir_var_shader_out));
1353

1354
      if ((modes &
1355
           (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global))) {
1356
         barrier = ir3_FENCE(b);
1357
         barrier->cat7.r = true;
1358
         barrier->cat7.w = true;
1359

1360
         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1361
            barrier->cat7.g = true;
1362
         }
1363

1364
         if (ctx->compiler->gpu_id > 600) {
1365
            if (modes & nir_var_mem_ssbo) {
1366
               barrier->cat7.l = true;
1367
            }
1368
         } else {
1369
            if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
1370
               barrier->cat7.l = true;
1371
            }
1372
         }
1373

1374
         barrier->barrier_class = 0;
1375
         barrier->barrier_conflict = 0;
1376

1377
         if (modes & nir_var_mem_shared) {
1378
            barrier->barrier_class |= IR3_BARRIER_SHARED_W;
1379
            barrier->barrier_conflict |=
1380
               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1381
         }
1382

1383
         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1384
            barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
1385
            barrier->barrier_conflict |=
1386
               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1387
         }
1388

1389
         /* TODO: check for image mode when it has a separate one */
1390
         if (modes & nir_var_mem_ssbo) {
1391
            barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
1392
            barrier->barrier_conflict |=
1393
               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
1394
         }
1395
         array_insert(b, b->keeps, barrier);
1396
      }
1397

1398
      if (exec_scope >= NIR_SCOPE_WORKGROUP) {
1399
         emit_control_barrier(ctx);
1400
      }
1401

1402
      return;
1403
   }
1404
   case nir_intrinsic_memory_barrier_tcs_patch:
1405
      /* Not applicable, see explanation for scoped_barrier + shader_out */
1406
      return;
1407
   case nir_intrinsic_memory_barrier_buffer:
1408
      barrier = ir3_FENCE(b);
1409
      barrier->cat7.g = true;
1410
      if (ctx->compiler->gpu_id > 600)
1411
         barrier->cat7.l = true;
1412
      barrier->cat7.r = true;
1413
      barrier->cat7.w = true;
1414
      barrier->barrier_class = IR3_BARRIER_BUFFER_W;
1415
      barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1416
      break;
1417
   case nir_intrinsic_memory_barrier_image:
1418
      barrier = ir3_FENCE(b);
1419
      barrier->cat7.g = true;
1420
      barrier->cat7.l = true;
1421
      barrier->cat7.r = true;
1422
      barrier->cat7.w = true;
1423
      barrier->barrier_class = IR3_BARRIER_IMAGE_W;
1424
      barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
1425
      break;
1426
   case nir_intrinsic_memory_barrier_shared:
1427
      barrier = ir3_FENCE(b);
1428
      if (ctx->compiler->gpu_id < 600)
1429
         barrier->cat7.l = true;
1430
      barrier->cat7.r = true;
1431
      barrier->cat7.w = true;
1432
      barrier->barrier_class = IR3_BARRIER_SHARED_W;
1433
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1434
      break;
1435
   case nir_intrinsic_memory_barrier:
1436
   case nir_intrinsic_group_memory_barrier:
1437
      barrier = ir3_FENCE(b);
1438
      barrier->cat7.g = true;
1439
      barrier->cat7.l = true;
1440
      barrier->cat7.r = true;
1441
      barrier->cat7.w = true;
1442
      barrier->barrier_class =
1443
         IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
1444
      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
1445
                                  IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
1446
                                  IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1447
      break;
1448
   default:
1449
      unreachable("boo");
1450
   }
1451

1452
   /* make sure barrier doesn't get DCE'd */
1453
   array_insert(b, b->keeps, barrier);
1454
}
1455

1456
static void
1457
add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
1458
                          unsigned compmask, struct ir3_instruction *instr)
1459
{
1460
   struct ir3_shader_variant *so = ctx->so;
1461
   unsigned n = so->inputs_count++;
1462

1463
   assert(instr->opc == OPC_META_INPUT);
1464
   instr->input.inidx = n;
1465
   instr->input.sysval = slot;
1466

1467
   so->inputs[n].sysval = true;
1468
   so->inputs[n].slot = slot;
1469
   so->inputs[n].compmask = compmask;
1470
   so->total_in++;
1471

1472
   so->sysval_in += util_last_bit(compmask);
1473
}
1474

1475
static struct ir3_instruction *
1476
create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
1477
                    unsigned compmask)
1478
{
1479
   assert(compmask);
1480
   struct ir3_instruction *sysval = create_input(ctx, compmask);
1481
   add_sysval_input_compmask(ctx, slot, compmask, sysval);
1482
   return sysval;
1483
}
1484

1485
static struct ir3_instruction *
1486
get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
1487
{
1488
   static const gl_system_value sysval_base =
1489
      SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1490

1491
   STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL ==
1492
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
1493
   STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE ==
1494
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
1495
   STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID ==
1496
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
1497
   STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE ==
1498
                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
1499
   STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL ==
1500
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
1501
   STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID ==
1502
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
1503
   STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE ==
1504
                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
1505

1506
   if (!ctx->ij[bary]) {
1507
      struct ir3_instruction *xy[2];
1508
      struct ir3_instruction *ij;
1509

1510
      ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
1511
      ir3_split_dest(ctx->block, xy, ij, 0, 2);
1512

1513
      ctx->ij[bary] = ir3_create_collect(ctx, xy, 2);
1514
   }
1515

1516
   return ctx->ij[bary];
1517
}
1518

1519
/* TODO: make this a common NIR helper?
1520
 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
1521
 * it can't be extended to work with this
1522
 */
1523
static gl_system_value
1524
nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
1525
{
1526
   enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
1527
   gl_system_value sysval;
1528

1529
   switch (intr->intrinsic) {
1530
   case nir_intrinsic_load_barycentric_pixel:
1531
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1532
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1533
      else
1534
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1535
      break;
1536
   case nir_intrinsic_load_barycentric_centroid:
1537
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1538
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
1539
      else
1540
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
1541
      break;
1542
   case nir_intrinsic_load_barycentric_sample:
1543
      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1544
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
1545
      else
1546
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
1547
      break;
1548
   default:
1549
      unreachable("invalid barycentric intrinsic");
1550
   }
1551

1552
   return sysval;
1553
}
1554

1555
static void
1556
emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1557
                           struct ir3_instruction **dst)
1558
{
1559
   gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
1560

1561
   if (!ctx->so->key.msaa) {
1562
      switch (sysval) {
1563
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
1564
         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1565
         break;
1566
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
1567
         if (ctx->compiler->gpu_id < 600)
1568
            sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1569
         break;
1570
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
1571
         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1572
         break;
1573
      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
1574
         if (ctx->compiler->gpu_id < 600)
1575
            sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1576
         break;
1577
      default:
1578
         break;
1579
      }
1580
   }
1581

1582
   enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1583

1584
   struct ir3_instruction *ij = get_barycentric(ctx, bary);
1585
   ir3_split_dest(ctx->block, dst, ij, 0, 2);
1586
}
1587

1588
static struct ir3_instruction *
1589
get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1590
{
1591
   if (!ctx->frag_coord) {
1592
      struct ir3_block *b = ctx->in_block;
1593
      struct ir3_instruction *xyzw[4];
1594
      struct ir3_instruction *hw_frag_coord;
1595

1596
      hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
1597
      ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
1598

1599
      /* for frag_coord.xy, we get unsigned values.. we need
1600
       * to subtract (integer) 8 and divide by 16 (right-
1601
       * shift by 4) then convert to float:
1602
       *
1603
       *    sub.s tmp, src, 8
1604
       *    shr.b tmp, tmp, 4
1605
       *    mov.u32f32 dst, tmp
1606
       *
1607
       */
1608
      for (int i = 0; i < 2; i++) {
1609
         xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
1610
         xyzw[i] =
1611
            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
1612
      }
1613

1614
      ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
1615
   }
1616

1617
   ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
1618

1619
   return ctx->frag_coord;
1620
}
1621

1622
static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1623
static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1624

1625
static void
1626
emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1627
{
1628
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1629
   struct ir3_instruction **dst;
1630
   struct ir3_instruction *const *src;
1631
   struct ir3_block *b = ctx->block;
1632
   unsigned dest_components = nir_intrinsic_dest_components(intr);
1633
   int idx;
1634

1635
   if (info->has_dest) {
1636
      dst = ir3_get_dst(ctx, &intr->dest, dest_components);
1637
   } else {
1638
      dst = NULL;
1639
   }
1640

1641
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1642
   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
1643
   const unsigned primitive_map = const_state->offsets.primitive_map * 4;
1644

1645
   switch (intr->intrinsic) {
1646
   case nir_intrinsic_load_uniform:
1647
      idx = nir_intrinsic_base(intr);
1648
      if (nir_src_is_const(intr->src[0])) {
1649
         idx += nir_src_as_uint(intr->src[0]);
1650
         for (int i = 0; i < dest_components; i++) {
1651
            dst[i] = create_uniform_typed(
1652
               b, idx + i,
1653
               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
1654
         }
1655
      } else {
1656
         src = ir3_get_src(ctx, &intr->src[0]);
1657
         for (int i = 0; i < dest_components; i++) {
1658
            dst[i] = create_uniform_indirect(
1659
               b, idx + i,
1660
               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
1661
               ir3_get_addr0(ctx, src[0], 1));
1662
         }
1663
         /* NOTE: if relative addressing is used, we set
1664
          * constlen in the compiler (to worst-case value)
1665
          * since we don't know in the assembler what the max
1666
          * addr reg value can be:
1667
          */
1668
         ctx->so->constlen =
1669
            MAX2(ctx->so->constlen, const_state->ubo_state.size / 16);
1670
      }
1671
      break;
1672

1673
   case nir_intrinsic_load_vs_primitive_stride_ir3:
1674
      dst[0] = create_uniform(b, primitive_param + 0);
1675
      break;
1676
   case nir_intrinsic_load_vs_vertex_stride_ir3:
1677
      dst[0] = create_uniform(b, primitive_param + 1);
1678
      break;
1679
   case nir_intrinsic_load_hs_patch_stride_ir3:
1680
      dst[0] = create_uniform(b, primitive_param + 2);
1681
      break;
1682
   case nir_intrinsic_load_patch_vertices_in:
1683
      dst[0] = create_uniform(b, primitive_param + 3);
1684
      break;
1685
   case nir_intrinsic_load_tess_param_base_ir3:
1686
      dst[0] = create_uniform(b, primitive_param + 4);
1687
      dst[1] = create_uniform(b, primitive_param + 5);
1688
      break;
1689
   case nir_intrinsic_load_tess_factor_base_ir3:
1690
      dst[0] = create_uniform(b, primitive_param + 6);
1691
      dst[1] = create_uniform(b, primitive_param + 7);
1692
      break;
1693

1694
   case nir_intrinsic_load_primitive_location_ir3:
1695
      idx = nir_intrinsic_driver_location(intr);
1696
      dst[0] = create_uniform(b, primitive_map + idx);
1697
      break;
1698

1699
   case nir_intrinsic_load_gs_header_ir3:
1700
      dst[0] = ctx->gs_header;
1701
      break;
1702
   case nir_intrinsic_load_tcs_header_ir3:
1703
      dst[0] = ctx->tcs_header;
1704
      break;
1705

1706
   case nir_intrinsic_load_primitive_id:
1707
      dst[0] = ctx->primitive_id;
1708
      break;
1709

1710
   case nir_intrinsic_load_tess_coord:
1711
      if (!ctx->tess_coord) {
1712
         ctx->tess_coord =
1713
            create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
1714
      }
1715
      ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
1716

1717
      /* Unused, but ir3_put_dst() below wants to free something */
1718
      dst[2] = create_immed(b, 0);
1719
      break;
1720

1721
   case nir_intrinsic_end_patch_ir3:
1722
      assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
1723
      struct ir3_instruction *end = ir3_PREDE(b);
1724
      array_insert(b, b->keeps, end);
1725

1726
      end->barrier_class = IR3_BARRIER_EVERYTHING;
1727
      end->barrier_conflict = IR3_BARRIER_EVERYTHING;
1728
      break;
1729

1730
   case nir_intrinsic_store_global_ir3:
1731
      ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
1732
      break;
1733
   case nir_intrinsic_load_global_ir3:
1734
      ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
1735
      break;
1736

1737
   case nir_intrinsic_load_ubo:
1738
      emit_intrinsic_load_ubo(ctx, intr, dst);
1739
      break;
1740
   case nir_intrinsic_load_ubo_vec4:
1741
      emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
1742
      break;
1743
   case nir_intrinsic_load_frag_coord:
1744
      ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
1745
      break;
1746
   case nir_intrinsic_load_sample_pos_from_id: {
1747
      /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
1748
       * but that doesn't seem necessary.
1749
       */
1750
      struct ir3_instruction *offset =
1751
         ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
1752
      offset->dsts[0]->wrmask = 0x3;
1753
      offset->cat5.type = TYPE_F32;
1754

1755
      ir3_split_dest(b, dst, offset, 0, 2);
1756

1757
      break;
1758
   }
1759
   case nir_intrinsic_load_size_ir3:
1760
      if (!ctx->ij[IJ_PERSP_SIZE]) {
1761
         ctx->ij[IJ_PERSP_SIZE] =
1762
            create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
1763
      }
1764
      dst[0] = ctx->ij[IJ_PERSP_SIZE];
1765
      break;
1766
   case nir_intrinsic_load_barycentric_centroid:
1767
   case nir_intrinsic_load_barycentric_sample:
1768
   case nir_intrinsic_load_barycentric_pixel:
1769
      emit_intrinsic_barycentric(ctx, intr, dst);
1770
      break;
1771
   case nir_intrinsic_load_interpolated_input:
1772
   case nir_intrinsic_load_input:
1773
      setup_input(ctx, intr);
1774
      break;
1775
   /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
1776
    * pass and replaced by an ir3-specifc version that adds the
1777
    * dword-offset in the last source.
1778
    */
1779
   case nir_intrinsic_load_ssbo_ir3:
1780
      ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1781
      break;
1782
   case nir_intrinsic_store_ssbo_ir3:
1783
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1784
          !ctx->s->info.fs.early_fragment_tests)
1785
         ctx->so->no_earlyz = true;
1786
      ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
1787
      break;
1788
   case nir_intrinsic_get_ssbo_size:
1789
      emit_intrinsic_ssbo_size(ctx, intr, dst);
1790
      break;
1791
   case nir_intrinsic_ssbo_atomic_add_ir3:
1792
   case nir_intrinsic_ssbo_atomic_imin_ir3:
1793
   case nir_intrinsic_ssbo_atomic_umin_ir3:
1794
   case nir_intrinsic_ssbo_atomic_imax_ir3:
1795
   case nir_intrinsic_ssbo_atomic_umax_ir3:
1796
   case nir_intrinsic_ssbo_atomic_and_ir3:
1797
   case nir_intrinsic_ssbo_atomic_or_ir3:
1798
   case nir_intrinsic_ssbo_atomic_xor_ir3:
1799
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
1800
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
1801
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1802
          !ctx->s->info.fs.early_fragment_tests)
1803
         ctx->so->no_earlyz = true;
1804
      dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
1805
      break;
1806
   case nir_intrinsic_load_shared:
1807
      emit_intrinsic_load_shared(ctx, intr, dst);
1808
      break;
1809
   case nir_intrinsic_store_shared:
1810
      emit_intrinsic_store_shared(ctx, intr);
1811
      break;
1812
   case nir_intrinsic_shared_atomic_add:
1813
   case nir_intrinsic_shared_atomic_imin:
1814
   case nir_intrinsic_shared_atomic_umin:
1815
   case nir_intrinsic_shared_atomic_imax:
1816
   case nir_intrinsic_shared_atomic_umax:
1817
   case nir_intrinsic_shared_atomic_and:
1818
   case nir_intrinsic_shared_atomic_or:
1819
   case nir_intrinsic_shared_atomic_xor:
1820
   case nir_intrinsic_shared_atomic_exchange:
1821
   case nir_intrinsic_shared_atomic_comp_swap:
1822
      dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
1823
      break;
1824
   case nir_intrinsic_load_scratch:
1825
      emit_intrinsic_load_scratch(ctx, intr, dst);
1826
      break;
1827
   case nir_intrinsic_store_scratch:
1828
      emit_intrinsic_store_scratch(ctx, intr);
1829
      break;
1830
   case nir_intrinsic_image_load:
1831
      emit_intrinsic_load_image(ctx, intr, dst);
1832
      break;
1833
   case nir_intrinsic_bindless_image_load:
1834
      /* Bindless uses the IBO state, which doesn't have swizzle filled out,
1835
       * so using isam doesn't work.
1836
       *
1837
       * TODO: can we use isam if we fill out more fields?
1838
       */
1839
      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1840
      break;
1841
   case nir_intrinsic_image_store:
1842
   case nir_intrinsic_bindless_image_store:
1843
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1844
          !ctx->s->info.fs.early_fragment_tests)
1845
         ctx->so->no_earlyz = true;
1846
      ctx->funcs->emit_intrinsic_store_image(ctx, intr);
1847
      break;
1848
   case nir_intrinsic_image_size:
1849
   case nir_intrinsic_bindless_image_size:
1850
      ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
1851
      break;
1852
   case nir_intrinsic_image_atomic_add:
1853
   case nir_intrinsic_bindless_image_atomic_add:
1854
   case nir_intrinsic_image_atomic_imin:
1855
   case nir_intrinsic_bindless_image_atomic_imin:
1856
   case nir_intrinsic_image_atomic_umin:
1857
   case nir_intrinsic_bindless_image_atomic_umin:
1858
   case nir_intrinsic_image_atomic_imax:
1859
   case nir_intrinsic_bindless_image_atomic_imax:
1860
   case nir_intrinsic_image_atomic_umax:
1861
   case nir_intrinsic_bindless_image_atomic_umax:
1862
   case nir_intrinsic_image_atomic_and:
1863
   case nir_intrinsic_bindless_image_atomic_and:
1864
   case nir_intrinsic_image_atomic_or:
1865
   case nir_intrinsic_bindless_image_atomic_or:
1866
   case nir_intrinsic_image_atomic_xor:
1867
   case nir_intrinsic_bindless_image_atomic_xor:
1868
   case nir_intrinsic_image_atomic_exchange:
1869
   case nir_intrinsic_bindless_image_atomic_exchange:
1870
   case nir_intrinsic_image_atomic_comp_swap:
1871
   case nir_intrinsic_bindless_image_atomic_comp_swap:
1872
      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1873
          !ctx->s->info.fs.early_fragment_tests)
1874
         ctx->so->no_earlyz = true;
1875
      dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
1876
      break;
1877
   case nir_intrinsic_scoped_barrier:
1878
   case nir_intrinsic_control_barrier:
1879
   case nir_intrinsic_memory_barrier:
1880
   case nir_intrinsic_group_memory_barrier:
1881
   case nir_intrinsic_memory_barrier_buffer:
1882
   case nir_intrinsic_memory_barrier_image:
1883
   case nir_intrinsic_memory_barrier_shared:
1884
   case nir_intrinsic_memory_barrier_tcs_patch:
1885
      emit_intrinsic_barrier(ctx, intr);
1886
      /* note that blk ptr no longer valid, make that obvious: */
1887
      b = NULL;
1888
      break;
1889
   case nir_intrinsic_store_output:
1890
      setup_output(ctx, intr);
1891
      break;
1892
   case nir_intrinsic_load_base_vertex:
1893
   case nir_intrinsic_load_first_vertex:
1894
      if (!ctx->basevertex) {
1895
         ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
1896
      }
1897
      dst[0] = ctx->basevertex;
1898
      break;
1899
   case nir_intrinsic_load_draw_id:
1900
      if (!ctx->draw_id) {
1901
         ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
1902
      }
1903
      dst[0] = ctx->draw_id;
1904
      break;
1905
   case nir_intrinsic_load_base_instance:
1906
      if (!ctx->base_instance) {
1907
         ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
1908
      }
1909
      dst[0] = ctx->base_instance;
1910
      break;
1911
   case nir_intrinsic_load_view_index:
1912
      if (!ctx->view_index) {
1913
         ctx->view_index =
1914
            create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
1915
      }
1916
      dst[0] = ctx->view_index;
1917
      break;
1918
   case nir_intrinsic_load_vertex_id_zero_base:
1919
   case nir_intrinsic_load_vertex_id:
1920
      if (!ctx->vertex_id) {
1921
         gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
1922
                                 ? SYSTEM_VALUE_VERTEX_ID
1923
                                 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
1924
         ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
1925
      }
1926
      dst[0] = ctx->vertex_id;
1927
      break;
1928
   case nir_intrinsic_load_instance_id:
1929
      if (!ctx->instance_id) {
1930
         ctx->instance_id =
1931
            create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
1932
      }
1933
      dst[0] = ctx->instance_id;
1934
      break;
1935
   case nir_intrinsic_load_sample_id:
1936
      ctx->so->per_samp = true;
1937
      FALLTHROUGH;
1938
   case nir_intrinsic_load_sample_id_no_per_sample:
1939
      if (!ctx->samp_id) {
1940
         ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
1941
         ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
1942
      }
1943
      dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
1944
      break;
1945
   case nir_intrinsic_load_sample_mask_in:
1946
      if (!ctx->samp_mask_in) {
1947
         ctx->samp_mask_in =
1948
            create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
1949
      }
1950
      dst[0] = ctx->samp_mask_in;
1951
      break;
1952
   case nir_intrinsic_load_user_clip_plane:
1953
      idx = nir_intrinsic_ucp_id(intr);
1954
      for (int i = 0; i < dest_components; i++) {
1955
         unsigned n = idx * 4 + i;
1956
         dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
1957
      }
1958
      break;
1959
   case nir_intrinsic_load_front_face:
1960
      if (!ctx->frag_face) {
1961
         ctx->so->frag_face = true;
1962
         ctx->frag_face =
1963
            create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
1964
         ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
1965
      }
1966
      /* for fragface, we get -1 for back and 0 for front. However this is
1967
       * the inverse of what nir expects (where ~0 is true).
1968
       */
1969
      dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
1970
                          create_immed_typed(b, 0, TYPE_U16), 0);
1971
      dst[0]->cat2.condition = IR3_COND_EQ;
1972
      break;
1973
   case nir_intrinsic_load_local_invocation_id:
1974
      if (!ctx->local_invocation_id) {
1975
         ctx->local_invocation_id =
1976
            create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
1977
      }
1978
      ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
1979
      break;
1980
   case nir_intrinsic_load_workgroup_id:
1981
   case nir_intrinsic_load_workgroup_id_zero_base:
1982
      if (!ctx->work_group_id) {
1983
         ctx->work_group_id =
1984
            create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
1985
         ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
1986
      }
1987
      ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
1988
      break;
1989
   case nir_intrinsic_load_base_workgroup_id:
1990
      for (int i = 0; i < dest_components; i++) {
1991
         dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
1992
      }
1993
      break;
1994
   case nir_intrinsic_load_num_workgroups:
1995
      for (int i = 0; i < dest_components; i++) {
1996
         dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
1997
      }
1998
      break;
1999
   case nir_intrinsic_load_workgroup_size:
2000
      for (int i = 0; i < dest_components; i++) {
2001
         dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2002
      }
2003
      break;
2004
   case nir_intrinsic_load_subgroup_size:
2005
      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
2006
      break;
2007
   case nir_intrinsic_load_subgroup_id_shift_ir3:
2008
      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2009
      break;
2010
   case nir_intrinsic_discard_if:
2011
   case nir_intrinsic_discard:
2012
   case nir_intrinsic_demote:
2013
   case nir_intrinsic_demote_if:
2014
   case nir_intrinsic_terminate:
2015
   case nir_intrinsic_terminate_if: {
2016
      struct ir3_instruction *cond, *kill;
2017

2018
      if (intr->intrinsic == nir_intrinsic_discard_if ||
2019
          intr->intrinsic == nir_intrinsic_demote_if ||
2020
          intr->intrinsic == nir_intrinsic_terminate_if) {
2021
         /* conditional discard: */
2022
         src = ir3_get_src(ctx, &intr->src[0]);
2023
         cond = src[0];
2024
      } else {
2025
         /* unconditional discard: */
2026
         cond = create_immed(b, 1);
2027
      }
2028

2029
      /* NOTE: only cmps.*.* can write p0.x: */
2030
      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2031
      cond->cat2.condition = IR3_COND_NE;
2032

2033
      /* condition always goes in predicate register: */
2034
      cond->dsts[0]->num = regid(REG_P0, 0);
2035
      cond->dsts[0]->flags &= ~IR3_REG_SSA;
2036

2037
      if (intr->intrinsic == nir_intrinsic_demote ||
2038
          intr->intrinsic == nir_intrinsic_demote_if) {
2039
         kill = ir3_DEMOTE(b, cond, 0);
2040
      } else {
2041
         kill = ir3_KILL(b, cond, 0);
2042
      }
2043

2044
      /* Side-effects should not be moved on a different side of the kill */
2045
      kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2046
      kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2047
      kill->srcs[0]->num = regid(REG_P0, 0);
2048
      array_insert(ctx->ir, ctx->ir->predicates, kill);
2049

2050
      array_insert(b, b->keeps, kill);
2051
      ctx->so->has_kill = true;
2052

2053
      break;
2054
   }
2055

2056
   case nir_intrinsic_cond_end_ir3: {
2057
      struct ir3_instruction *cond, *kill;
2058

2059
      src = ir3_get_src(ctx, &intr->src[0]);
2060
      cond = src[0];
2061

2062
      /* NOTE: only cmps.*.* can write p0.x: */
2063
      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2064
      cond->cat2.condition = IR3_COND_NE;
2065

2066
      /* condition always goes in predicate register: */
2067
      cond->dsts[0]->num = regid(REG_P0, 0);
2068

2069
      kill = ir3_PREDT(b, cond, 0);
2070

2071
      kill->barrier_class = IR3_BARRIER_EVERYTHING;
2072
      kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
2073

2074
      array_insert(ctx->ir, ctx->ir->predicates, kill);
2075
      array_insert(b, b->keeps, kill);
2076
      break;
2077
   }
2078

2079
   case nir_intrinsic_vote_any:
2080
   case nir_intrinsic_vote_all: {
2081
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2082
      struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2083
      if (intr->intrinsic == nir_intrinsic_vote_any)
2084
         dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
2085
      else
2086
         dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
2087
      dst[0]->srcs[0]->num = regid(REG_P0, 0);
2088
      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2089
      break;
2090
   }
2091
   case nir_intrinsic_elect:
2092
      dst[0] = ir3_ELECT_MACRO(ctx->block);
2093
      /* This may expand to a divergent if/then, so allocate stack space for
2094
       * it.
2095
       */
2096
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2097
      break;
2098

2099
   case nir_intrinsic_read_invocation_cond_ir3: {
2100
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2101
      struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
2102
      dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
2103
                                   src, 0);
2104
      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2105
      dst[0]->srcs[0]->num = regid(REG_P0, 0);
2106
      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2107
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2108
      break;
2109
   }
2110

2111
   case nir_intrinsic_read_first_invocation: {
2112
      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2113
      dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
2114
      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2115
      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2116
      break;
2117
   }
2118

2119
   case nir_intrinsic_ballot: {
2120
      struct ir3_instruction *ballot;
2121
      unsigned components = intr->dest.ssa.num_components;
2122
      if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
2123
         /* ballot(true) is just MOVMSK */
2124
         ballot = ir3_MOVMSK(ctx->block, components);
2125
      } else {
2126
         struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2127
         struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2128
         ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
2129
         ballot->srcs[0]->num = regid(REG_P0, 0);
2130
         array_insert(ctx->ir, ctx->ir->predicates, ballot);
2131
         ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2132
      }
2133
      ir3_split_dest(ctx->block, dst, ballot, 0, components);
2134
      break;
2135
   }
2136

2137
   case nir_intrinsic_load_shared_ir3:
2138
      emit_intrinsic_load_shared_ir3(ctx, intr, dst);
2139
      break;
2140
   case nir_intrinsic_store_shared_ir3:
2141
      emit_intrinsic_store_shared_ir3(ctx, intr);
2142
      break;
2143
   case nir_intrinsic_bindless_resource_ir3:
2144
      dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
2145
      break;
2146
   default:
2147
      ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
2148
                        nir_intrinsic_infos[intr->intrinsic].name);
2149
      break;
2150
   }
2151

2152
   if (info->has_dest)
2153
      ir3_put_dst(ctx, &intr->dest);
2154
}
2155

2156
static void
2157
emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
2158
{
2159
   struct ir3_instruction **dst =
2160
      ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
2161

2162
   if (instr->def.bit_size == 16) {
2163
      for (int i = 0; i < instr->def.num_components; i++)
2164
         dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
2165
   } else {
2166
      for (int i = 0; i < instr->def.num_components; i++)
2167
         dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
2168
   }
2169
}
2170

2171
static void
2172
emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
2173
{
2174
   struct ir3_instruction **dst =
2175
      ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
2176
   type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;
2177

2178
   /* backend doesn't want undefined instructions, so just plug
2179
    * in 0.0..
2180
    */
2181
   for (int i = 0; i < undef->def.num_components; i++)
2182
      dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
2183
}
2184

2185
/*
2186
 * texture fetch/sample instructions:
2187
 */
2188

2189
static type_t
2190
get_tex_dest_type(nir_tex_instr *tex)
2191
{
2192
   type_t type;
2193

2194
   switch (tex->dest_type) {
2195
   case nir_type_float32:
2196
      return TYPE_F32;
2197
   case nir_type_float16:
2198
      return TYPE_F16;
2199
   case nir_type_int32:
2200
      return TYPE_S32;
2201
   case nir_type_int16:
2202
      return TYPE_S16;
2203
   case nir_type_bool32:
2204
   case nir_type_uint32:
2205
      return TYPE_U32;
2206
   case nir_type_bool16:
2207
   case nir_type_uint16:
2208
      return TYPE_U16;
2209
   case nir_type_invalid:
2210
   default:
2211
      unreachable("bad dest_type");
2212
   }
2213

2214
   return type;
2215
}
2216

2217
static void
2218
tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
2219
{
2220
   unsigned coords =
2221
      glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
2222
   unsigned flags = 0;
2223

2224
   /* note: would use tex->coord_components.. except txs.. also,
2225
    * since array index goes after shadow ref, we don't want to
2226
    * count it:
2227
    */
2228
   if (coords == 3)
2229
      flags |= IR3_INSTR_3D;
2230

2231
   if (tex->is_shadow && tex->op != nir_texop_lod)
2232
      flags |= IR3_INSTR_S;
2233

2234
   if (tex->is_array && tex->op != nir_texop_lod)
2235
      flags |= IR3_INSTR_A;
2236

2237
   *flagsp = flags;
2238
   *coordsp = coords;
2239
}
2240

2241
/* Gets the sampler/texture idx as a hvec2.  Which could either be dynamic
2242
 * or immediate (in which case it will get lowered later to a non .s2en
2243
 * version of the tex instruction which encode tex/samp as immediates:
2244
 */
2245
static struct tex_src_info
2246
get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
2247
{
2248
   struct ir3_block *b = ctx->block;
2249
   struct tex_src_info info = {0};
2250
   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
2251
   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2252
   struct ir3_instruction *texture, *sampler;
2253

2254
   if (texture_idx >= 0 || sampler_idx >= 0) {
2255
      /* Bindless case */
2256
      info.flags |= IR3_INSTR_B;
2257

2258
      if (tex->texture_non_uniform || tex->sampler_non_uniform)
2259
         info.flags |= IR3_INSTR_NONUNIF;
2260

2261
      /* Gather information required to determine which encoding to
2262
       * choose as well as for prefetch.
2263
       */
2264
      nir_intrinsic_instr *bindless_tex = NULL;
2265
      bool tex_const;
2266
      if (texture_idx >= 0) {
2267
         ctx->so->bindless_tex = true;
2268
         bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
2269
         assert(bindless_tex);
2270
         info.tex_base = nir_intrinsic_desc_set(bindless_tex);
2271
         tex_const = nir_src_is_const(bindless_tex->src[0]);
2272
         if (tex_const)
2273
            info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
2274
      } else {
2275
         /* To simplify some of the logic below, assume the index is
2276
          * constant 0 when it's not enabled.
2277
          */
2278
         tex_const = true;
2279
         info.tex_idx = 0;
2280
      }
2281
      nir_intrinsic_instr *bindless_samp = NULL;
2282
      bool samp_const;
2283
      if (sampler_idx >= 0) {
2284
         ctx->so->bindless_samp = true;
2285
         bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
2286
         assert(bindless_samp);
2287
         info.samp_base = nir_intrinsic_desc_set(bindless_samp);
2288
         samp_const = nir_src_is_const(bindless_samp->src[0]);
2289
         if (samp_const)
2290
            info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
2291
      } else {
2292
         samp_const = true;
2293
         info.samp_idx = 0;
2294
      }
2295

2296
      /* Choose encoding. */
2297
      if (tex_const && samp_const && info.tex_idx < 256 &&
2298
          info.samp_idx < 256) {
2299
         if (info.tex_idx < 16 && info.samp_idx < 16 &&
2300
             (!bindless_tex || !bindless_samp ||
2301
              info.tex_base == info.samp_base)) {
2302
            /* Everything fits within the instruction */
2303
            info.base = info.tex_base;
2304
            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
2305
         } else {
2306
            info.base = info.tex_base;
2307
            info.a1_val = info.tex_idx << 3 | info.samp_base;
2308
            info.combined_idx = info.samp_idx;
2309
            info.flags |= IR3_INSTR_A1EN;
2310
         }
2311
         info.samp_tex = NULL;
2312
      } else {
2313
         info.flags |= IR3_INSTR_S2EN;
2314
         /* In the indirect case, we only use a1.x to store the sampler
2315
          * base if it differs from the texture base.
2316
          */
2317
         if (!bindless_tex || !bindless_samp ||
2318
             info.tex_base == info.samp_base) {
2319
            info.base = info.tex_base;
2320
         } else {
2321
            info.base = info.tex_base;
2322
            info.a1_val = info.samp_base;
2323
            info.flags |= IR3_INSTR_A1EN;
2324
         }
2325

2326
         /* Note: the indirect source is now a vec2 instead of hvec2, and
2327
          * for some reason the texture and sampler are swapped.
2328
          */
2329
         struct ir3_instruction *texture, *sampler;
2330

2331
         if (bindless_tex) {
2332
            texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2333
         } else {
2334
            texture = create_immed(b, 0);
2335
         }
2336

2337
         if (bindless_samp) {
2338
            sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2339
         } else {
2340
            sampler = create_immed(b, 0);
2341
         }
2342
         info.samp_tex = ir3_collect(ctx, texture, sampler);
2343
      }
2344
   } else {
2345
      info.flags |= IR3_INSTR_S2EN;
2346
      texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
2347
      sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
2348
      if (texture_idx >= 0) {
2349
         texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2350
         texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
2351
      } else {
2352
         /* TODO what to do for dynamic case? I guess we only need the
2353
          * max index for astc srgb workaround so maybe not a problem
2354
          * to worry about if we don't enable indirect samplers for
2355
          * a4xx?
2356
          */
2357
         ctx->max_texture_index =
2358
            MAX2(ctx->max_texture_index, tex->texture_index);
2359
         texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
2360
         info.tex_idx = tex->texture_index;
2361
      }
2362

2363
      if (sampler_idx >= 0) {
2364
         sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2365
         sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
2366
      } else {
2367
         sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
2368
         info.samp_idx = tex->texture_index;
2369
      }
2370

2371
      info.samp_tex = ir3_collect(ctx, sampler, texture);
2372
   }
2373

2374
   return info;
2375
}
2376

2377
static void
2378
emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
2379
{
2380
   struct ir3_block *b = ctx->block;
2381
   struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
2382
   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
2383
   struct ir3_instruction *lod, *compare, *proj, *sample_index;
2384
   struct tex_src_info info = {0};
2385
   bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
2386
   unsigned i, coords, flags, ncomp;
2387
   unsigned nsrc0 = 0, nsrc1 = 0;
2388
   type_t type;
2389
   opc_t opc = 0;
2390

2391
   ncomp = nir_dest_num_components(tex->dest);
2392

2393
   coord = off = ddx = ddy = NULL;
2394
   lod = proj = compare = sample_index = NULL;
2395

2396
   dst = ir3_get_dst(ctx, &tex->dest, ncomp);
2397

2398
   for (unsigned i = 0; i < tex->num_srcs; i++) {
2399
      switch (tex->src[i].src_type) {
2400
      case nir_tex_src_coord:
2401
         coord = ir3_get_src(ctx, &tex->src[i].src);
2402
         break;
2403
      case nir_tex_src_bias:
2404
         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2405
         has_bias = true;
2406
         break;
2407
      case nir_tex_src_lod:
2408
         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2409
         has_lod = true;
2410
         break;
2411
      case nir_tex_src_comparator: /* shadow comparator */
2412
         compare = ir3_get_src(ctx, &tex->src[i].src)[0];
2413
         break;
2414
      case nir_tex_src_projector:
2415
         proj = ir3_get_src(ctx, &tex->src[i].src)[0];
2416
         has_proj = true;
2417
         break;
2418
      case nir_tex_src_offset:
2419
         off = ir3_get_src(ctx, &tex->src[i].src);
2420
         has_off = true;
2421
         break;
2422
      case nir_tex_src_ddx:
2423
         ddx = ir3_get_src(ctx, &tex->src[i].src);
2424
         break;
2425
      case nir_tex_src_ddy:
2426
         ddy = ir3_get_src(ctx, &tex->src[i].src);
2427
         break;
2428
      case nir_tex_src_ms_index:
2429
         sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
2430
         break;
2431
      case nir_tex_src_texture_offset:
2432
      case nir_tex_src_sampler_offset:
2433
      case nir_tex_src_texture_handle:
2434
      case nir_tex_src_sampler_handle:
2435
         /* handled in get_tex_samp_src() */
2436
         break;
2437
      default:
2438
         ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
2439
                           tex->src[i].src_type);
2440
         return;
2441
      }
2442
   }
2443

2444
   switch (tex->op) {
2445
   case nir_texop_tex_prefetch:
2446
      compile_assert(ctx, !has_bias);
2447
      compile_assert(ctx, !has_lod);
2448
      compile_assert(ctx, !compare);
2449
      compile_assert(ctx, !has_proj);
2450
      compile_assert(ctx, !has_off);
2451
      compile_assert(ctx, !ddx);
2452
      compile_assert(ctx, !ddy);
2453
      compile_assert(ctx, !sample_index);
2454
      compile_assert(
2455
         ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
2456
      compile_assert(
2457
         ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
2458

2459
      if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
2460
         opc = OPC_META_TEX_PREFETCH;
2461
         ctx->so->num_sampler_prefetch++;
2462
         break;
2463
      }
2464
      FALLTHROUGH;
2465
   case nir_texop_tex:
2466
      opc = has_lod ? OPC_SAML : OPC_SAM;
2467
      break;
2468
   case nir_texop_txb:
2469
      opc = OPC_SAMB;
2470
      break;
2471
   case nir_texop_txl:
2472
      opc = OPC_SAML;
2473
      break;
2474
   case nir_texop_txd:
2475
      opc = OPC_SAMGQ;
2476
      break;
2477
   case nir_texop_txf:
2478
      opc = OPC_ISAML;
2479
      break;
2480
   case nir_texop_lod:
2481
      opc = OPC_GETLOD;
2482
      break;
2483
   case nir_texop_tg4:
2484
      /* NOTE: a4xx might need to emulate gather w/ txf (this is
2485
       * what blob does, seems gather  is broken?), and a3xx did
2486
       * not support it (but probably could also emulate).
2487
       */
2488
      switch (tex->component) {
2489
      case 0:
2490
         opc = OPC_GATHER4R;
2491
         break;
2492
      case 1:
2493
         opc = OPC_GATHER4G;
2494
         break;
2495
      case 2:
2496
         opc = OPC_GATHER4B;
2497
         break;
2498
      case 3:
2499
         opc = OPC_GATHER4A;
2500
         break;
2501
      }
2502
      break;
2503
   case nir_texop_txf_ms_fb:
2504
   case nir_texop_txf_ms:
2505
      opc = OPC_ISAMM;
2506
      break;
2507
   default:
2508
      ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
2509
      return;
2510
   }
2511

2512
   tex_info(tex, &flags, &coords);
2513

2514
   /*
2515
    * lay out the first argument in the proper order:
2516
    *  - actual coordinates first
2517
    *  - shadow reference
2518
    *  - array index
2519
    *  - projection w
2520
    *  - starting at offset 4, dpdx.xy, dpdy.xy
2521
    *
2522
    * bias/lod go into the second arg
2523
    */
2524

2525
   /* insert tex coords: */
2526
   for (i = 0; i < coords; i++)
2527
      src0[i] = coord[i];
2528

2529
   nsrc0 = i;
2530

2531
   /* scale up integer coords for TXF based on the LOD */
2532
   if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
2533
      assert(has_lod);
2534
      for (i = 0; i < coords; i++)
2535
         src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
2536
   }
2537

2538
   if (coords == 1) {
2539
      /* hw doesn't do 1d, so we treat it as 2d with
2540
       * height of 1, and patch up the y coord.
2541
       */
2542
      if (is_isam(opc)) {
2543
         src0[nsrc0++] = create_immed(b, 0);
2544
      } else {
2545
         src0[nsrc0++] = create_immed(b, fui(0.5));
2546
      }
2547
   }
2548

2549
   if (tex->is_shadow && tex->op != nir_texop_lod)
2550
      src0[nsrc0++] = compare;
2551

2552
   if (tex->is_array && tex->op != nir_texop_lod) {
2553
      struct ir3_instruction *idx = coord[coords];
2554

2555
      /* the array coord for cube arrays needs 0.5 added to it */
2556
      if (ctx->compiler->array_index_add_half && !is_isam(opc))
2557
         idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
2558

2559
      src0[nsrc0++] = idx;
2560
   }
2561

2562
   if (has_proj) {
2563
      src0[nsrc0++] = proj;
2564
      flags |= IR3_INSTR_P;
2565
   }
2566

2567
   /* pad to 4, then ddx/ddy: */
2568
   if (tex->op == nir_texop_txd) {
2569
      while (nsrc0 < 4)
2570
         src0[nsrc0++] = create_immed(b, fui(0.0));
2571
      for (i = 0; i < coords; i++)
2572
         src0[nsrc0++] = ddx[i];
2573
      if (coords < 2)
2574
         src0[nsrc0++] = create_immed(b, fui(0.0));
2575
      for (i = 0; i < coords; i++)
2576
         src0[nsrc0++] = ddy[i];
2577
      if (coords < 2)
2578
         src0[nsrc0++] = create_immed(b, fui(0.0));
2579
   }
2580

2581
   /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
2582
    * with scaled x coord according to requested sample:
2583
    */
2584
   if (opc == OPC_ISAMM) {
2585
      if (ctx->compiler->txf_ms_with_isaml) {
2586
         /* the samples are laid out in x dimension as
2587
          *     0 1 2 3
2588
          * x_ms = (x << ms) + sample_index;
2589
          */
2590
         struct ir3_instruction *ms;
2591
         ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
2592

2593
         src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
2594
         src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
2595

2596
         opc = OPC_ISAML;
2597
      } else {
2598
         src0[nsrc0++] = sample_index;
2599
      }
2600
   }
2601

2602
   /*
2603
    * second argument (if applicable):
2604
    *  - offsets
2605
    *  - lod
2606
    *  - bias
2607
    */
2608
   if (has_off | has_lod | has_bias) {
2609
      if (has_off) {
2610
         unsigned off_coords = coords;
2611
         if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2612
            off_coords--;
2613
         for (i = 0; i < off_coords; i++)
2614
            src1[nsrc1++] = off[i];
2615
         if (off_coords < 2)
2616
            src1[nsrc1++] = create_immed(b, fui(0.0));
2617
         flags |= IR3_INSTR_O;
2618
      }
2619

2620
      if (has_lod | has_bias)
2621
         src1[nsrc1++] = lod;
2622
   }
2623

2624
   type = get_tex_dest_type(tex);
2625

2626
   if (opc == OPC_GETLOD)
2627
      type = TYPE_S32;
2628

2629
   if (tex->op == nir_texop_txf_ms_fb) {
2630
      /* only expect a single txf_ms_fb per shader: */
2631
      compile_assert(ctx, !ctx->so->fb_read);
2632
      compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
2633

2634
      ctx->so->fb_read = true;
2635
      info.samp_tex = ir3_collect(
2636
         ctx, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
2637
         create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
2638
      info.flags = IR3_INSTR_S2EN;
2639

2640
      ctx->so->num_samp++;
2641
   } else {
2642
      info = get_tex_samp_tex_src(ctx, tex);
2643
   }
2644

2645
   struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
2646
   struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);
2647

2648
   if (opc == OPC_META_TEX_PREFETCH) {
2649
      int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2650

2651
      compile_assert(ctx, tex->src[idx].src.is_ssa);
2652

2653
      sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
2654
                    get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
2655
      sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
2656
      /* make sure not to add irrelevant flags like S2EN */
2657
      sam->flags = flags | (info.flags & IR3_INSTR_B);
2658
      sam->prefetch.tex = info.tex_idx;
2659
      sam->prefetch.samp = info.samp_idx;
2660
      sam->prefetch.tex_base = info.tex_base;
2661
      sam->prefetch.samp_base = info.samp_base;
2662
   } else {
2663
      info.flags |= flags;
2664
      sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
2665
   }
2666

2667
   if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
2668
       !nir_tex_instr_is_query(tex)) {
2669
      assert(opc != OPC_META_TEX_PREFETCH);
2670

2671
      /* only need first 3 components: */
2672
      sam->dsts[0]->wrmask = 0x7;
2673
      ir3_split_dest(b, dst, sam, 0, 3);
2674

2675
      /* we need to sample the alpha separately with a non-ASTC
2676
       * texture state:
2677
       */
2678
      sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
2679
                    col0, col1);
2680

2681
      array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
2682

2683
      /* fixup .w component: */
2684
      ir3_split_dest(b, &dst[3], sam, 3, 1);
2685
   } else {
2686
      /* normal (non-workaround) case: */
2687
      ir3_split_dest(b, dst, sam, 0, ncomp);
2688
   }
2689

2690
   /* GETLOD returns results in 4.8 fixed point */
2691
   if (opc == OPC_GETLOD) {
2692
      struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
2693

2694
      compile_assert(ctx, tex->dest_type == nir_type_float32);
2695
      for (i = 0; i < 2; i++) {
2696
         dst[i] =
2697
            ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0, factor, 0);
2698
      }
2699
   }
2700

2701
   ir3_put_dst(ctx, &tex->dest);
2702
}
2703

2704
static void
2705
emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
2706
{
2707
   struct ir3_block *b = ctx->block;
2708
   struct ir3_instruction **dst, *sam;
2709
   type_t dst_type = get_tex_dest_type(tex);
2710
   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2711

2712
   dst = ir3_get_dst(ctx, &tex->dest, 1);
2713

2714
   sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
2715

2716
   /* even though there is only one component, since it ends
2717
    * up in .y/.z/.w rather than .x, we need a split_dest()
2718
    */
2719
   ir3_split_dest(b, dst, sam, idx, 1);
2720

2721
   /* The # of levels comes from getinfo.z. We need to add 1 to it, since
2722
    * the value in TEX_CONST_0 is zero-based.
2723
    */
2724
   if (ctx->compiler->levels_add_one)
2725
      dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
2726

2727
   ir3_put_dst(ctx, &tex->dest);
2728
}
2729

2730
static void
2731
emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
2732
{
2733
   struct ir3_block *b = ctx->block;
2734
   struct ir3_instruction **dst, *sam;
2735
   struct ir3_instruction *lod;
2736
   unsigned flags, coords;
2737
   type_t dst_type = get_tex_dest_type(tex);
2738
   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2739

2740
   tex_info(tex, &flags, &coords);
2741
   info.flags |= flags;
2742

2743
   /* Actually we want the number of dimensions, not coordinates. This
2744
    * distinction only matters for cubes.
2745
    */
2746
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2747
      coords = 2;
2748

2749
   dst = ir3_get_dst(ctx, &tex->dest, 4);
2750

2751
   int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
2752
   compile_assert(ctx, lod_idx >= 0);
2753

2754
   lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
2755

2756
   if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
2757
      sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
2758
   } else {
2759
      /*
2760
       * The maximum value which OPC_GETSIZE could return for one dimension
2761
       * is 0x007ff0, however sampler buffer could be much bigger.
2762
       * Blob uses OPC_GETBUF for them.
2763
       */
2764
      sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
2765
   }
2766

2767
   ir3_split_dest(b, dst, sam, 0, 4);
2768

2769
   /* Array size actually ends up in .w rather than .z. This doesn't
2770
    * matter for miplevel 0, but for higher mips the value in z is
2771
    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
2772
    * returned, which means that we have to add 1 to it for arrays.
2773
    */
2774
   if (tex->is_array) {
2775
      if (ctx->compiler->levels_add_one) {
2776
         dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
2777
      } else {
2778
         dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
2779
      }
2780
   }
2781

2782
   ir3_put_dst(ctx, &tex->dest);
2783
}
2784

2785
/* phi instructions are left partially constructed.  We don't resolve
2786
 * their srcs until the end of the shader, since (eg. loops) one of
2787
 * the phi's srcs might be defined after the phi due to back edges in
2788
 * the CFG.
2789
 */
2790
static void
2791
emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
2792
{
2793
   struct ir3_instruction *phi, **dst;
2794

2795
   /* NOTE: phi's should be lowered to scalar at this point */
2796
   compile_assert(ctx, nphi->dest.ssa.num_components == 1);
2797

2798
   dst = ir3_get_dst(ctx, &nphi->dest, 1);
2799

2800
   phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
2801
                          exec_list_length(&nphi->srcs));
2802
   __ssa_dst(phi);
2803
   phi->phi.nphi = nphi;
2804

2805
   dst[0] = phi;
2806

2807
   ir3_put_dst(ctx, &nphi->dest);
2808
}
2809

2810
static struct ir3_block *get_block(struct ir3_context *ctx,
2811
                                   const nir_block *nblock);
2812

2813
static struct ir3_instruction *
2814
read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
2815
             struct ir3_instruction *phi, nir_phi_instr *nphi)
2816
{
2817
   if (!blk->nblock) {
2818
      struct ir3_instruction *continue_phi =
2819
         ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
2820
      __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
2821

2822
      for (unsigned i = 0; i < blk->predecessors_count; i++) {
2823
         struct ir3_instruction *src =
2824
            read_phi_src(ctx, blk->predecessors[i], phi, nphi);
2825
         if (src)
2826
            __ssa_src(continue_phi, src, 0);
2827
         else
2828
            ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
2829
      }
2830

2831
      return continue_phi;
2832
   }
2833

2834
   nir_foreach_phi_src (nsrc, nphi) {
2835
      if (blk->nblock == nsrc->pred) {
2836
         if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
2837
            /* Create an ir3 undef */
2838
            return NULL;
2839
         } else {
2840
            return ir3_get_src(ctx, &nsrc->src)[0];
2841
         }
2842
      }
2843
   }
2844

2845
   unreachable("couldn't find phi node ir3 block");
2846
   return NULL;
2847
}
2848

2849
static void
2850
resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
2851
{
2852
   foreach_instr (phi, &block->instr_list) {
2853
      if (phi->opc != OPC_META_PHI)
2854
         break;
2855

2856
      nir_phi_instr *nphi = phi->phi.nphi;
2857

2858
      if (!nphi) /* skip continue phis created above */
2859
         continue;
2860

2861
      for (unsigned i = 0; i < block->predecessors_count; i++) {
2862
         struct ir3_block *pred = block->predecessors[i];
2863
         struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
2864
         if (src) {
2865
            __ssa_src(phi, src, 0);
2866
         } else {
2867
            /* Create an ir3 undef */
2868
            ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
2869
         }
2870
      }
2871
   }
2872
}
2873

2874
static void
2875
emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
2876
{
2877
   switch (jump->type) {
2878
   case nir_jump_break:
2879
   case nir_jump_continue:
2880
   case nir_jump_return:
2881
      /* I *think* we can simply just ignore this, and use the
2882
       * successor block link to figure out where we need to
2883
       * jump to for break/continue
2884
       */
2885
      break;
2886
   default:
2887
      ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
2888
      break;
2889
   }
2890
}
2891

2892
static void
2893
emit_instr(struct ir3_context *ctx, nir_instr *instr)
2894
{
2895
   switch (instr->type) {
2896
   case nir_instr_type_alu:
2897
      emit_alu(ctx, nir_instr_as_alu(instr));
2898
      break;
2899
   case nir_instr_type_deref:
2900
      /* ignored, handled as part of the intrinsic they are src to */
2901
      break;
2902
   case nir_instr_type_intrinsic:
2903
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2904
      break;
2905
   case nir_instr_type_load_const:
2906
      emit_load_const(ctx, nir_instr_as_load_const(instr));
2907
      break;
2908
   case nir_instr_type_ssa_undef:
2909
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
2910
      break;
2911
   case nir_instr_type_tex: {
2912
      nir_tex_instr *tex = nir_instr_as_tex(instr);
2913
      /* couple tex instructions get special-cased:
2914
       */
2915
      switch (tex->op) {
2916
      case nir_texop_txs:
2917
         emit_tex_txs(ctx, tex);
2918
         break;
2919
      case nir_texop_query_levels:
2920
         emit_tex_info(ctx, tex, 2);
2921
         break;
2922
      case nir_texop_texture_samples:
2923
         emit_tex_info(ctx, tex, 3);
2924
         break;
2925
      default:
2926
         emit_tex(ctx, tex);
2927
         break;
2928
      }
2929
      break;
2930
   }
2931
   case nir_instr_type_jump:
2932
      emit_jump(ctx, nir_instr_as_jump(instr));
2933
      break;
2934
   case nir_instr_type_phi:
2935
      emit_phi(ctx, nir_instr_as_phi(instr));
2936
      break;
2937
   case nir_instr_type_call:
2938
   case nir_instr_type_parallel_copy:
2939
      ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
2940
                        instr->type);
2941
      break;
2942
   }
2943
}
2944

2945
static struct ir3_block *
2946
get_block(struct ir3_context *ctx, const nir_block *nblock)
2947
{
2948
   struct ir3_block *block;
2949
   struct hash_entry *hentry;
2950

2951
   hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
2952
   if (hentry)
2953
      return hentry->data;
2954

2955
   block = ir3_block_create(ctx->ir);
2956
   block->nblock = nblock;
2957
   _mesa_hash_table_insert(ctx->block_ht, nblock, block);
2958

2959
   return block;
2960
}
2961

2962
static struct ir3_block *
2963
get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
2964
{
2965
   struct hash_entry *hentry;
2966

2967
   hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
2968
   if (hentry)
2969
      return hentry->data;
2970

2971
   return get_block(ctx, nblock);
2972
}
2973

2974
static struct ir3_block *
2975
create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
2976
{
2977
   struct ir3_block *block = ir3_block_create(ctx->ir);
2978
   block->nblock = NULL;
2979
   _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
2980
   return block;
2981
}
2982

2983
static void
2984
emit_block(struct ir3_context *ctx, nir_block *nblock)
2985
{
2986
   ctx->block = get_block(ctx, nblock);
2987

2988
   list_addtail(&ctx->block->node, &ctx->ir->block_list);
2989

2990
   ctx->block->loop_id = ctx->loop_id;
2991

2992
   /* re-emit addr register in each block if needed: */
2993
   for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
2994
      _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
2995
      ctx->addr0_ht[i] = NULL;
2996
   }
2997

2998
   _mesa_hash_table_u64_destroy(ctx->addr1_ht);
2999
   ctx->addr1_ht = NULL;
3000

3001
   nir_foreach_instr (instr, nblock) {
3002
      ctx->cur_instr = instr;
3003
      emit_instr(ctx, instr);
3004
      ctx->cur_instr = NULL;
3005
      if (ctx->error)
3006
         return;
3007
   }
3008

3009
   for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
3010
      if (nblock->successors[i]) {
3011
         ctx->block->successors[i] =
3012
            get_block_or_continue(ctx, nblock->successors[i]);
3013
         ctx->block->physical_successors[i] = ctx->block->successors[i];
3014
      }
3015
   }
3016

3017
   _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
3018
}
3019

3020
static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
3021

3022
static void
3023
emit_if(struct ir3_context *ctx, nir_if *nif)
3024
{
3025
   struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
3026

3027
   if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
3028
      ctx->block->condition = ssa(condition->srcs[0]);
3029
      ctx->block->brtype = IR3_BRANCH_ANY;
3030
   } else if (condition->opc == OPC_ALL_MACRO &&
3031
              condition->block == ctx->block) {
3032
      ctx->block->condition = ssa(condition->srcs[0]);
3033
      ctx->block->brtype = IR3_BRANCH_ALL;
3034
   } else if (condition->opc == OPC_ELECT_MACRO &&
3035
              condition->block == ctx->block) {
3036
      ctx->block->condition = NULL;
3037
      ctx->block->brtype = IR3_BRANCH_GETONE;
3038
   } else {
3039
      ctx->block->condition = ir3_get_predicate(ctx, condition);
3040
      ctx->block->brtype = IR3_BRANCH_COND;
3041
   }
3042

3043
   emit_cf_list(ctx, &nif->then_list);
3044
   emit_cf_list(ctx, &nif->else_list);
3045

3046
   struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
3047
   struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
3048
   assert(last_then->physical_successors[0] &&
3049
          !last_then->physical_successors[1]);
3050
   last_then->physical_successors[1] = first_else;
3051

3052
   struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
3053
   struct ir3_block *after_if =
3054
      get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
3055
   last_else->physical_successors[0] = after_if;
3056
}
3057

3058
static void
3059
emit_loop(struct ir3_context *ctx, nir_loop *nloop)
3060
{
3061
   unsigned old_loop_id = ctx->loop_id;
3062
   ctx->loop_id = ctx->so->loops + 1;
3063

3064
   struct nir_block *nstart = nir_loop_first_block(nloop);
3065
   struct ir3_block *continue_blk = NULL;
3066

3067
   /* There's always one incoming edge from outside the loop, and if there
3068
    * are more than two backedges from inside the loop (so more than 2 total
3069
    * edges) then we need to create a continue block after the loop to ensure
3070
    * that control reconverges at the end of each loop iteration.
3071
    */
3072
   if (nstart->predecessors->entries > 2) {
3073
      continue_blk = create_continue_block(ctx, nstart);
3074
   }
3075

3076
   emit_cf_list(ctx, &nloop->body);
3077

3078
   if (continue_blk) {
3079
      struct ir3_block *start = get_block(ctx, nstart);
3080
      continue_blk->successors[0] = start;
3081
      continue_blk->physical_successors[0] = start;
3082
      list_addtail(&continue_blk->node, &ctx->ir->block_list);
3083
   }
3084

3085
   ctx->so->loops++;
3086
   ctx->loop_id = old_loop_id;
3087
}
3088

3089
static void
3090
stack_push(struct ir3_context *ctx)
3091
{
3092
   ctx->stack++;
3093
   ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
3094
}
3095

3096
static void
3097
stack_pop(struct ir3_context *ctx)
3098
{
3099
   compile_assert(ctx, ctx->stack > 0);
3100
   ctx->stack--;
3101
}
3102

3103
static void
3104
emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
3105
{
3106
   foreach_list_typed (nir_cf_node, node, node, list) {
3107
      switch (node->type) {
3108
      case nir_cf_node_block:
3109
         emit_block(ctx, nir_cf_node_as_block(node));
3110
         break;
3111
      case nir_cf_node_if:
3112
         stack_push(ctx);
3113
         emit_if(ctx, nir_cf_node_as_if(node));
3114
         stack_pop(ctx);
3115
         break;
3116
      case nir_cf_node_loop:
3117
         stack_push(ctx);
3118
         emit_loop(ctx, nir_cf_node_as_loop(node));
3119
         stack_pop(ctx);
3120
         break;
3121
      case nir_cf_node_function:
3122
         ir3_context_error(ctx, "TODO\n");
3123
         break;
3124
      }
3125
   }
3126
}
3127

3128
/* emit stream-out code.  At this point, the current block is the original
3129
 * (nir) end block, and nir ensures that all flow control paths terminate
3130
 * into the end block.  We re-purpose the original end block to generate
3131
 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
3132
 * block holding stream-out write instructions, followed by the new end
3133
 * block:
3134
 *
3135
 *   blockOrigEnd {
3136
 *      p0.x = (vtxcnt < maxvtxcnt)
3137
 *      // succs: blockStreamOut, blockNewEnd
3138
 *   }
3139
 *   blockStreamOut {
3140
 *      // preds: blockOrigEnd
3141
 *      ... stream-out instructions ...
3142
 *      // succs: blockNewEnd
3143
 *   }
3144
 *   blockNewEnd {
3145
 *      // preds: blockOrigEnd, blockStreamOut
3146
 *   }
3147
 */
3148
static void
3149
emit_stream_out(struct ir3_context *ctx)
3150
{
3151
   struct ir3 *ir = ctx->ir;
3152
   struct ir3_stream_output_info *strmout = &ctx->so->shader->stream_output;
3153
   struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
3154
   struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
3155
   struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
3156

3157
   /* create vtxcnt input in input block at top of shader,
3158
    * so that it is seen as live over the entire duration
3159
    * of the shader:
3160
    */
3161
   vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
3162
   maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
3163

3164
   /* at this point, we are at the original 'end' block,
3165
    * re-purpose this block to stream-out condition, then
3166
    * append stream-out block and new-end block
3167
    */
3168
   orig_end_block = ctx->block;
3169

3170
   // maybe w/ store_global intrinsic, we could do this
3171
   // stuff in nir->nir pass
3172

3173
   stream_out_block = ir3_block_create(ir);
3174
   list_addtail(&stream_out_block->node, &ir->block_list);
3175

3176
   new_end_block = ir3_block_create(ir);
3177
   list_addtail(&new_end_block->node, &ir->block_list);
3178

3179
   orig_end_block->successors[0] = stream_out_block;
3180
   orig_end_block->successors[1] = new_end_block;
3181

3182
   stream_out_block->successors[0] = new_end_block;
3183

3184
   /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
3185
   cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
3186
   cond->dsts[0]->num = regid(REG_P0, 0);
3187
   cond->dsts[0]->flags &= ~IR3_REG_SSA;
3188
   cond->cat2.condition = IR3_COND_LT;
3189

3190
   /* condition goes on previous block to the conditional,
3191
    * since it is used to pick which of the two successor
3192
    * paths to take:
3193
    */
3194
   orig_end_block->condition = cond;
3195

3196
   /* switch to stream_out_block to generate the stream-out
3197
    * instructions:
3198
    */
3199
   ctx->block = stream_out_block;
3200

3201
   /* Calculate base addresses based on vtxcnt.  Instructions
3202
    * generated for bases not used in following loop will be
3203
    * stripped out in the backend.
3204
    */
3205
   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3206
      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
3207
      unsigned stride = strmout->stride[i];
3208
      struct ir3_instruction *base, *off;
3209

3210
      base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
3211

3212
      /* 24-bit should be enough: */
3213
      off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
3214
                        create_immed(ctx->block, stride * 4), 0);
3215

3216
      bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
3217
   }
3218

3219
   /* Generate the per-output store instructions: */
3220
   for (unsigned i = 0; i < strmout->num_outputs; i++) {
3221
      for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
3222
         unsigned c = j + strmout->output[i].start_component;
3223
         struct ir3_instruction *base, *out, *stg;
3224

3225
         base = bases[strmout->output[i].output_buffer];
3226
         out = ctx->outputs[regid(strmout->output[i].register_index, c)];
3227

3228
         stg = ir3_STG(
3229
            ctx->block, base, 0,
3230
            create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
3231
            0, out, 0, create_immed(ctx->block, 1), 0);
3232
         stg->cat6.type = TYPE_U32;
3233

3234
         array_insert(ctx->block, ctx->block->keeps, stg);
3235
      }
3236
   }
3237

3238
   /* and finally switch to the new_end_block: */
3239
   ctx->block = new_end_block;
3240
}
3241

3242
static void
3243
setup_predecessors(struct ir3 *ir)
3244
{
3245
   foreach_block (block, &ir->block_list) {
3246
      for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
3247
         if (block->successors[i])
3248
            ir3_block_add_predecessor(block->successors[i], block);
3249
         if (block->physical_successors[i])
3250
            ir3_block_add_physical_predecessor(block->physical_successors[i],
3251
                                               block);
3252
      }
3253
   }
3254
}
3255

3256
static void
3257
emit_function(struct ir3_context *ctx, nir_function_impl *impl)
3258
{
3259
   nir_metadata_require(impl, nir_metadata_block_index);
3260

3261
   compile_assert(ctx, ctx->stack == 0);
3262

3263
   emit_cf_list(ctx, &impl->body);
3264
   emit_block(ctx, impl->end_block);
3265

3266
   compile_assert(ctx, ctx->stack == 0);
3267

3268
   /* at this point, we should have a single empty block,
3269
    * into which we emit the 'end' instruction.
3270
    */
3271
   compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
3272

3273
   /* If stream-out (aka transform-feedback) enabled, emit the
3274
    * stream-out instructions, followed by a new empty block (into
3275
    * which the 'end' instruction lands).
3276
    *
3277
    * NOTE: it is done in this order, rather than inserting before
3278
    * we emit end_block, because NIR guarantees that all blocks
3279
    * flow into end_block, and that end_block has no successors.
3280
    * So by re-purposing end_block as the first block of stream-
3281
    * out, we guarantee that all exit paths flow into the stream-
3282
    * out instructions.
3283
    */
3284
   if ((ctx->compiler->gpu_id < 500) &&
3285
       (ctx->so->shader->stream_output.num_outputs > 0) &&
3286
       !ctx->so->binning_pass) {
3287
      debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
3288
      emit_stream_out(ctx);
3289
   }
3290

3291
   setup_predecessors(ctx->ir);
3292
   foreach_block (block, &ctx->ir->block_list) {
3293
      resolve_phis(ctx, block);
3294
   }
3295
}
3296

3297
static void
3298
setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3299
{
3300
   struct ir3_shader_variant *so = ctx->so;
3301
   struct ir3_instruction *coord = NULL;
3302

3303
   if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
3304
      coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);
3305

3306
   compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
3307

3308
   unsigned frac = nir_intrinsic_component(intr);
3309
   unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
3310
   unsigned ncomp = nir_intrinsic_dest_components(intr);
3311
   unsigned n = nir_intrinsic_base(intr) + offset;
3312
   unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
3313
   unsigned compmask;
3314

3315
   /* Inputs are loaded using ldlw or ldg for other stages. */
3316
   compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
3317
                          ctx->so->type == MESA_SHADER_VERTEX);
3318

3319
   if (ctx->so->type == MESA_SHADER_FRAGMENT)
3320
      compmask = BITFIELD_MASK(ncomp) << frac;
3321
   else
3322
      compmask = BITFIELD_MASK(ncomp + frac);
3323

3324
   /* for a4xx+ rasterflat */
3325
   if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
3326
      coord = NULL;
3327

3328
   so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
3329

3330
   so->inputs[n].slot = slot;
3331
   so->inputs[n].compmask |= compmask;
3332
   so->inputs_count = MAX2(so->inputs_count, n + 1);
3333
   compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
3334
   so->inputs[n].flat = !coord;
3335

3336
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3337
      compile_assert(ctx, slot != VARYING_SLOT_POS);
3338

3339
      so->inputs[n].bary = true;
3340

3341
      for (int i = 0; i < ncomp; i++) {
3342
         unsigned idx = (n * 4) + i + frac;
3343
         ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
3344
      }
3345
   } else {
3346
      struct ir3_instruction *input = NULL;
3347

3348
      foreach_input (in, ctx->ir) {
3349
         if (in->input.inidx == n) {
3350
            input = in;
3351
            break;
3352
         }
3353
      }
3354

3355
      if (!input) {
3356
         input = create_input(ctx, compmask);
3357
         input->input.inidx = n;
3358
      } else {
3359
         /* For aliased inputs, just append to the wrmask.. ie. if we
3360
          * first see a vec2 index at slot N, and then later a vec4,
3361
          * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
3362
          */
3363
         input->dsts[0]->wrmask |= compmask;
3364
      }
3365

3366
      for (int i = 0; i < ncomp + frac; i++) {
3367
         unsigned idx = (n * 4) + i;
3368
         compile_assert(ctx, idx < ctx->ninputs);
3369

3370
         /* fixup the src wrmask to avoid validation fail */
3371
         if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
3372
            ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
3373
            continue;
3374
         }
3375

3376
         ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
3377
      }
3378

3379
      for (int i = 0; i < ncomp; i++) {
3380
         unsigned idx = (n * 4) + i + frac;
3381
         ctx->last_dst[i] = ctx->inputs[idx];
3382
      }
3383
   }
3384
}
3385

3386
/* Initially we assign non-packed inloc's for varyings, as we don't really
3387
 * know up-front which components will be unused.  After all the compilation
3388
 * stages we scan the shader to see which components are actually used, and
3389
 * re-pack the inlocs to eliminate unneeded varyings.
3390
 */
3391
static void
3392
pack_inlocs(struct ir3_context *ctx)
3393
{
3394
   struct ir3_shader_variant *so = ctx->so;
3395
   uint8_t used_components[so->inputs_count];
3396

3397
   memset(used_components, 0, sizeof(used_components));
3398

3399
   /*
3400
    * First Step: scan shader to find which bary.f/ldlv remain:
3401
    */
3402

3403
   foreach_block (block, &ctx->ir->block_list) {
3404
      foreach_instr (instr, &block->instr_list) {
3405
         if (is_input(instr)) {
3406
            unsigned inloc = instr->srcs[0]->iim_val;
3407
            unsigned i = inloc / 4;
3408
            unsigned j = inloc % 4;
3409

3410
            compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
3411
            compile_assert(ctx, i < so->inputs_count);
3412

3413
            used_components[i] |= 1 << j;
3414
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
3415
            for (int n = 0; n < 2; n++) {
3416
               unsigned inloc = instr->prefetch.input_offset + n;
3417
               unsigned i = inloc / 4;
3418
               unsigned j = inloc % 4;
3419

3420
               compile_assert(ctx, i < so->inputs_count);
3421

3422
               used_components[i] |= 1 << j;
3423
            }
3424
         }
3425
      }
3426
   }
3427

3428
   /*
3429
    * Second Step: reassign varying inloc/slots:
3430
    */
3431

3432
   unsigned actual_in = 0;
3433
   unsigned inloc = 0;
3434

3435
   /* for clip+cull distances, unused components can't be eliminated because
3436
    * they're read by fixed-function, even if there's a hole.  Note that
3437
    * clip/cull distance arrays must be declared in the FS, so we can just
3438
    * use the NIR clip/cull distances to avoid reading ucp_enables in the
3439
    * shader key.
3440
    */
3441
   unsigned clip_cull_size =
3442
      ctx->so->shader->nir->info.clip_distance_array_size +
3443
      ctx->so->shader->nir->info.cull_distance_array_size;
3444
   unsigned clip_cull_mask = MASK(clip_cull_size);
3445

3446
   for (unsigned i = 0; i < so->inputs_count; i++) {
3447
      unsigned compmask = 0, maxcomp = 0;
3448

3449
      so->inputs[i].inloc = inloc;
3450
      so->inputs[i].bary = false;
3451

3452
      if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
3453
          so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
3454
         if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
3455
            compmask = clip_cull_mask & 0xf;
3456
         else
3457
            compmask = clip_cull_mask >> 4;
3458
         used_components[i] = compmask;
3459
      }
3460

3461
      for (unsigned j = 0; j < 4; j++) {
3462
         if (!(used_components[i] & (1 << j)))
3463
            continue;
3464

3465
         compmask |= (1 << j);
3466
         actual_in++;
3467
         maxcomp = j + 1;
3468

3469
         /* at this point, since used_components[i] mask is only
3470
          * considering varyings (ie. not sysvals) we know this
3471
          * is a varying:
3472
          */
3473
         so->inputs[i].bary = true;
3474
      }
3475

3476
      if (so->inputs[i].bary) {
3477
         so->varying_in++;
3478
         so->inputs[i].compmask = (1 << maxcomp) - 1;
3479
         inloc += maxcomp;
3480
      }
3481
   }
3482

3483
   /*
3484
    * Third Step: reassign packed inloc's:
3485
    */
3486

3487
   foreach_block (block, &ctx->ir->block_list) {
3488
      foreach_instr (instr, &block->instr_list) {
3489
         if (is_input(instr)) {
3490
            unsigned inloc = instr->srcs[0]->iim_val;
3491
            unsigned i = inloc / 4;
3492
            unsigned j = inloc % 4;
3493

3494
            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
3495
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
3496
            unsigned i = instr->prefetch.input_offset / 4;
3497
            unsigned j = instr->prefetch.input_offset % 4;
3498
            instr->prefetch.input_offset = so->inputs[i].inloc + j;
3499
         }
3500
      }
3501
   }
3502
}
3503

3504
static void
3505
setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3506
{
3507
   struct ir3_shader_variant *so = ctx->so;
3508
   nir_io_semantics io = nir_intrinsic_io_semantics(intr);
3509

3510
   compile_assert(ctx, nir_src_is_const(intr->src[1]));
3511

3512
   unsigned offset = nir_src_as_uint(intr->src[1]);
3513
   unsigned n = nir_intrinsic_base(intr) + offset;
3514
   unsigned frac = nir_intrinsic_component(intr);
3515
   unsigned ncomp = nir_intrinsic_src_components(intr, 0);
3516

3517
   /* For per-view variables, each user-facing slot corresponds to multiple
3518
    * views, each with a corresponding driver_location, and the offset is for
3519
    * the driver_location. To properly figure out of the slot, we'd need to
3520
    * plumb through the number of views. However, for now we only use
3521
    * per-view with gl_Position, so we assume that the variable is not an
3522
    * array or matrix (so there are no indirect accesses to the variable
3523
    * itself) and the indirect offset corresponds to the view.
3524
    */
3525
   unsigned slot = io.location + (io.per_view ? 0 : offset);
3526

3527
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3528
      switch (slot) {
3529
      case FRAG_RESULT_DEPTH:
3530
         so->writes_pos = true;
3531
         break;
3532
      case FRAG_RESULT_COLOR:
3533
         if (!ctx->s->info.fs.color_is_dual_source) {
3534
            so->color0_mrt = 1;
3535
         } else {
3536
            slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
3537
         }
3538
         break;
3539
      case FRAG_RESULT_SAMPLE_MASK:
3540
         so->writes_smask = true;
3541
         break;
3542
      case FRAG_RESULT_STENCIL:
3543
         so->writes_stencilref = true;
3544
         break;
3545
      default:
3546
         slot += io.dual_source_blend_index; /* For dual-src blend */
3547
         if (slot >= FRAG_RESULT_DATA0)
3548
            break;
3549
         ir3_context_error(ctx, "unknown FS output name: %s\n",
3550
                           gl_frag_result_name(slot));
3551
      }
3552
   } else if (ctx->so->type == MESA_SHADER_VERTEX ||
3553
              ctx->so->type == MESA_SHADER_TESS_EVAL ||
3554
              ctx->so->type == MESA_SHADER_GEOMETRY) {
3555
      switch (slot) {
3556
      case VARYING_SLOT_POS:
3557
         so->writes_pos = true;
3558
         break;
3559
      case VARYING_SLOT_PSIZ:
3560
         so->writes_psize = true;
3561
         break;
3562
      case VARYING_SLOT_PRIMITIVE_ID:
3563
      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
3564
         debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
3565
         FALLTHROUGH;
3566
      case VARYING_SLOT_COL0:
3567
      case VARYING_SLOT_COL1:
3568
      case VARYING_SLOT_BFC0:
3569
      case VARYING_SLOT_BFC1:
3570
      case VARYING_SLOT_FOGC:
3571
      case VARYING_SLOT_CLIP_DIST0:
3572
      case VARYING_SLOT_CLIP_DIST1:
3573
      case VARYING_SLOT_CLIP_VERTEX:
3574
      case VARYING_SLOT_LAYER:
3575
      case VARYING_SLOT_VIEWPORT:
3576
         break;
3577
      default:
3578
         if (slot >= VARYING_SLOT_VAR0)
3579
            break;
3580
         if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
3581
            break;
3582
         ir3_context_error(ctx, "unknown %s shader output name: %s\n",
3583
                           _mesa_shader_stage_to_string(ctx->so->type),
3584
                           gl_varying_slot_name_for_stage(slot, ctx->so->type));
3585
      }
3586
   } else {
3587
      ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
3588
   }
3589

3590
   so->outputs_count = MAX2(so->outputs_count, n + 1);
3591
   compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));
3592

3593
   so->outputs[n].slot = slot;
3594
   if (io.per_view)
3595
      so->outputs[n].view = offset;
3596

3597
   for (int i = 0; i < ncomp; i++) {
3598
      unsigned idx = (n * 4) + i + frac;
3599
      compile_assert(ctx, idx < ctx->noutputs);
3600
      ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
3601
   }
3602

3603
   /* if varying packing doesn't happen, we could end up in a situation
3604
    * with "holes" in the output, and since the per-generation code that
3605
    * sets up varying linkage registers doesn't expect to have more than
3606
    * one varying per vec4 slot, pad the holes.
3607
    *
3608
    * Note that this should probably generate a performance warning of
3609
    * some sort.
3610
    */
3611
   for (int i = 0; i < frac; i++) {
3612
      unsigned idx = (n * 4) + i;
3613
      if (!ctx->outputs[idx]) {
3614
         ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
3615
      }
3616
   }
3617

3618
   struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
3619
   for (int i = 0; i < ncomp; i++) {
3620
      unsigned idx = (n * 4) + i + frac;
3621
      ctx->outputs[idx] = src[i];
3622
   }
3623
}
3624

3625
static bool
3626
uses_load_input(struct ir3_shader_variant *so)
3627
{
3628
   return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
3629
}
3630

3631
static bool
3632
uses_store_output(struct ir3_shader_variant *so)
3633
{
3634
   switch (so->type) {
3635
   case MESA_SHADER_VERTEX:
3636
      return !so->key.has_gs && !so->key.tessellation;
3637
   case MESA_SHADER_TESS_EVAL:
3638
      return !so->key.has_gs;
3639
   case MESA_SHADER_GEOMETRY:
3640
   case MESA_SHADER_FRAGMENT:
3641
      return true;
3642
   case MESA_SHADER_TESS_CTRL:
3643
   case MESA_SHADER_COMPUTE:
3644
      return false;
3645
   default:
3646
      unreachable("unknown stage");
3647
   }
3648
}
3649

3650
static void
3651
emit_instructions(struct ir3_context *ctx)
3652
{
3653
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
3654

3655
   /* some varying setup which can't be done in setup_input(): */
3656
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3657
      nir_foreach_shader_in_variable (var, ctx->s) {
3658
         /* if any varyings have 'sample' qualifer, that triggers us
3659
          * to run in per-sample mode:
3660
          */
3661
         if (var->data.sample)
3662
            ctx->so->per_samp = true;
3663

3664
         /* set rasterflat flag for front/back color */
3665
         if (var->data.interpolation == INTERP_MODE_NONE) {
3666
            switch (var->data.location) {
3667
            case VARYING_SLOT_COL0:
3668
            case VARYING_SLOT_COL1:
3669
            case VARYING_SLOT_BFC0:
3670
            case VARYING_SLOT_BFC1:
3671
               ctx->so->inputs[var->data.driver_location].rasterflat = true;
3672
               break;
3673
            default:
3674
               break;
3675
            }
3676
         }
3677
      }
3678
   }
3679

3680
   if (uses_load_input(ctx->so)) {
3681
      ctx->so->inputs_count = ctx->s->num_inputs;
3682
      compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
3683
      ctx->ninputs = ctx->s->num_inputs * 4;
3684
      ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
3685
   } else {
3686
      ctx->ninputs = 0;
3687
      ctx->so->inputs_count = 0;
3688
   }
3689

3690
   if (uses_store_output(ctx->so)) {
3691
      ctx->noutputs = ctx->s->num_outputs * 4;
3692
      ctx->outputs =
3693
         rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
3694
   } else {
3695
      ctx->noutputs = 0;
3696
   }
3697

3698
   ctx->ir = ir3_create(ctx->compiler, ctx->so);
3699

3700
   /* Create inputs in first block: */
3701
   ctx->block = get_block(ctx, nir_start_block(fxn));
3702
   ctx->in_block = ctx->block;
3703

3704
   /* for fragment shader, the vcoord input register is used as the
3705
    * base for bary.f varying fetch instrs:
3706
    *
3707
    * TODO defer creating ctx->ij_pixel and corresponding sysvals
3708
    * until emit_intrinsic when we know they are actually needed.
3709
    * For now, we defer creating ctx->ij_centroid, etc, since we
3710
    * only need ij_pixel for "old style" varying inputs (ie.
3711
    * tgsi_to_nir)
3712
    */
3713
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3714
      ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
3715
   }
3716

3717
   /* Defer add_sysval_input() stuff until after setup_inputs(),
3718
    * because sysvals need to be appended after varyings:
3719
    */
3720
   if (ctx->ij[IJ_PERSP_PIXEL]) {
3721
      add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
3722
                                ctx->ij[IJ_PERSP_PIXEL]);
3723
   }
3724

3725
   /* Tesselation shaders always need primitive ID for indexing the
3726
    * BO. Geometry shaders don't always need it but when they do it has be
3727
    * delivered and unclobbered in the VS. To make things easy, we always
3728
    * make room for it in VS/DS.
3729
    */
3730
   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
3731
   bool has_gs = ctx->so->key.has_gs;
3732
   switch (ctx->so->type) {
3733
   case MESA_SHADER_VERTEX:
3734
      if (has_tess) {
3735
         ctx->tcs_header =
3736
            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
3737
         ctx->primitive_id =
3738
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3739
      } else if (has_gs) {
3740
         ctx->gs_header =
3741
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3742
         ctx->primitive_id =
3743
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3744
      }
3745
      break;
3746
   case MESA_SHADER_TESS_CTRL:
3747
      ctx->tcs_header =
3748
         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
3749
      ctx->primitive_id =
3750
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3751
      break;
3752
   case MESA_SHADER_TESS_EVAL:
3753
      if (has_gs)
3754
         ctx->gs_header =
3755
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3756
      ctx->primitive_id =
3757
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3758
      break;
3759
   case MESA_SHADER_GEOMETRY:
3760
      ctx->gs_header =
3761
         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3762
      ctx->primitive_id =
3763
         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3764
      break;
3765
   default:
3766
      break;
3767
   }
3768

3769
   /* Find # of samplers. Just assume that we'll be reading from images.. if
3770
    * it is write-only we don't have to count it, but after lowering derefs
3771
    * is too late to compact indices for that.
3772
    */
3773
   ctx->so->num_samp =
3774
      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
3775

3776
   /* Save off clip+cull information. */
3777
   ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
3778
   ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
3779
                        << ctx->s->info.clip_distance_array_size;
3780

3781
   ctx->so->pvtmem_size = ctx->s->scratch_size;
3782
   ctx->so->shared_size = ctx->s->info.shared_size;
3783

3784
   /* NOTE: need to do something more clever when we support >1 fxn */
3785
   nir_foreach_register (reg, &fxn->registers) {
3786
      ir3_declare_array(ctx, reg);
3787
   }
3788
   /* And emit the body: */
3789
   ctx->impl = fxn;
3790
   emit_function(ctx, fxn);
3791
}
3792

3793
/* Fixup tex sampler state for astc/srgb workaround instructions.  We
3794
 * need to assign the tex state indexes for these after we know the
3795
 * max tex index.
3796
 */
3797
static void
3798
fixup_astc_srgb(struct ir3_context *ctx)
3799
{
3800
   struct ir3_shader_variant *so = ctx->so;
3801
   /* indexed by original tex idx, value is newly assigned alpha sampler
3802
    * state tex idx.  Zero is invalid since there is at least one sampler
3803
    * if we get here.
3804
    */
3805
   unsigned alt_tex_state[16] = {0};
3806
   unsigned tex_idx = ctx->max_texture_index + 1;
3807
   unsigned idx = 0;
3808

3809
   so->astc_srgb.base = tex_idx;
3810

3811
   for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
3812
      struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
3813

3814
      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
3815

3816
      if (alt_tex_state[sam->cat5.tex] == 0) {
3817
         /* assign new alternate/alpha tex state slot: */
3818
         alt_tex_state[sam->cat5.tex] = tex_idx++;
3819
         so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
3820
         so->astc_srgb.count++;
3821
      }
3822

3823
      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
3824
   }
3825
}
3826

3827
static bool
3828
output_slot_used_for_binning(gl_varying_slot slot)
3829
{
3830
   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
3831
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
3832
          slot == VARYING_SLOT_VIEWPORT;
3833
}
3834

3835
static struct ir3_instruction *
3836
find_end(struct ir3 *ir)
3837
{
3838
   foreach_block_rev (block, &ir->block_list) {
3839
      foreach_instr_rev (instr, &block->instr_list) {
3840
         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
3841
            return instr;
3842
      }
3843
   }
3844
   unreachable("couldn't find end instruction");
3845
}
3846

3847
static void
3848
fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
3849
{
3850
   struct ir3_shader_variant *so = ctx->so;
3851
   unsigned i, j;
3852

3853
   /* first pass, remove unused outputs from the IR level outputs: */
3854
   for (i = 0, j = 0; i < end->srcs_count; i++) {
3855
      unsigned outidx = end->end.outidxs[i];
3856
      unsigned slot = so->outputs[outidx].slot;
3857

3858
      if (output_slot_used_for_binning(slot)) {
3859
         end->srcs[j] = end->srcs[i];
3860
         end->end.outidxs[j] = end->end.outidxs[i];
3861
         j++;
3862
      }
3863
   }
3864
   end->srcs_count = j;
3865

3866
   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
3867
    * table:
3868
    */
3869
   for (i = 0, j = 0; i < so->outputs_count; i++) {
3870
      unsigned slot = so->outputs[i].slot;
3871

3872
      if (output_slot_used_for_binning(slot)) {
3873
         so->outputs[j] = so->outputs[i];
3874

3875
         /* fixup outidx to point to new output table entry: */
3876
         for (unsigned k = 0; k < end->srcs_count; k++) {
3877
            if (end->end.outidxs[k] == i) {
3878
               end->end.outidxs[k] = j;
3879
               break;
3880
            }
3881
         }
3882

3883
         j++;
3884
      }
3885
   }
3886
   so->outputs_count = j;
3887
}
3888

3889
static void
3890
collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
3891
{
3892
   unsigned idx = 0;
3893

3894
   /* Collect sampling instructions eligible for pre-dispatch. */
3895
   foreach_block (block, &ir->block_list) {
3896
      foreach_instr_safe (instr, &block->instr_list) {
3897
         if (instr->opc == OPC_META_TEX_PREFETCH) {
3898
            assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
3899
            struct ir3_sampler_prefetch *fetch =
3900
               &ctx->so->sampler_prefetch[idx];
3901
            idx++;
3902

3903
            if (instr->flags & IR3_INSTR_B) {
3904
               fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
3905
               /* In bindless mode, the index is actually the base */
3906
               fetch->tex_id = instr->prefetch.tex_base;
3907
               fetch->samp_id = instr->prefetch.samp_base;
3908
               fetch->tex_bindless_id = instr->prefetch.tex;
3909
               fetch->samp_bindless_id = instr->prefetch.samp;
3910
            } else {
3911
               fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
3912
               fetch->tex_id = instr->prefetch.tex;
3913
               fetch->samp_id = instr->prefetch.samp;
3914
            }
3915
            fetch->wrmask = instr->dsts[0]->wrmask;
3916
            fetch->dst = instr->dsts[0]->num;
3917
            fetch->src = instr->prefetch.input_offset;
3918

3919
            /* These are the limits on a5xx/a6xx, we might need to
3920
             * revisit if SP_FS_PREFETCH[n] changes on later gens:
3921
             */
3922
            assert(fetch->dst <= 0x3f);
3923
            assert(fetch->tex_id <= 0x1f);
3924
            assert(fetch->samp_id < 0xf);
3925

3926
            ctx->so->total_in =
3927
               MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
3928

3929
            fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
3930

3931
            /* Remove the prefetch placeholder instruction: */
3932
            list_delinit(&instr->node);
3933
         }
3934
      }
3935
   }
3936
}
3937

3938
int
3939
ir3_compile_shader_nir(struct ir3_compiler *compiler,
3940
                       struct ir3_shader_variant *so)
3941
{
3942
   struct ir3_context *ctx;
3943
   struct ir3 *ir;
3944
   int ret = 0, max_bary;
3945
   bool progress;
3946

3947
   assert(!so->ir);
3948

3949
   ctx = ir3_context_init(compiler, so);
3950
   if (!ctx) {
3951
      DBG("INIT failed!");
3952
      ret = -1;
3953
      goto out;
3954
   }
3955

3956
   emit_instructions(ctx);
3957

3958
   if (ctx->error) {
3959
      DBG("EMIT failed!");
3960
      ret = -1;
3961
      goto out;
3962
   }
3963

3964
   ir = so->ir = ctx->ir;
3965

3966
   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
3967
    * NOP and has an epilogue that writes the VS outputs to local storage, to
3968
    * be read by the HS.  Then it resets execution mask (chmask) and chains
3969
    * to the next shader (chsh). There are also a few output values which we
3970
    * must send to the next stage via registers, and in order for both stages
3971
    * to agree on the register used we must force these to be in specific
3972
    * registers.
3973
    */
3974
   if ((so->type == MESA_SHADER_VERTEX &&
3975
        (so->key.has_gs || so->key.tessellation)) ||
3976
       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
3977
      struct ir3_instruction *outputs[3];
3978
      unsigned outidxs[3];
3979
      unsigned regids[3];
3980
      unsigned outputs_count = 0;
3981

3982
      if (ctx->primitive_id) {
3983
         unsigned n = so->outputs_count++;
3984
         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
3985

3986
         struct ir3_instruction *out = ir3_collect(ctx, ctx->primitive_id);
3987
         outputs[outputs_count] = out;
3988
         outidxs[outputs_count] = n;
3989
         regids[outputs_count] = regid(0, 1);
3990
         outputs_count++;
3991
      }
3992

3993
      if (ctx->gs_header) {
3994
         unsigned n = so->outputs_count++;
3995
         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
3996
         struct ir3_instruction *out = ir3_collect(ctx, ctx->gs_header);
3997
         outputs[outputs_count] = out;
3998
         outidxs[outputs_count] = n;
3999
         regids[outputs_count] = regid(0, 0);
4000
         outputs_count++;
4001
      }
4002

4003
      if (ctx->tcs_header) {
4004
         unsigned n = so->outputs_count++;
4005
         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
4006
         struct ir3_instruction *out = ir3_collect(ctx, ctx->tcs_header);
4007
         outputs[outputs_count] = out;
4008
         outidxs[outputs_count] = n;
4009
         regids[outputs_count] = regid(0, 0);
4010
         outputs_count++;
4011
      }
4012

4013
      struct ir3_instruction *chmask =
4014
         ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
4015
      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
4016
      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
4017

4018
      for (unsigned i = 0; i < outputs_count; i++)
4019
         __ssa_src(chmask, outputs[i], 0)->num = regids[i];
4020

4021
      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
4022
      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4023

4024
      array_insert(ctx->block, ctx->block->keeps, chmask);
4025

4026
      struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
4027
      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
4028
      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
4029
   } else {
4030
      assert((ctx->noutputs % 4) == 0);
4031
      unsigned outidxs[ctx->noutputs / 4];
4032
      struct ir3_instruction *outputs[ctx->noutputs / 4];
4033
      unsigned outputs_count = 0;
4034

4035
      struct ir3_block *old_block = ctx->block;
4036
      /* Insert these collect's in the block before the end-block if
4037
       * possible, so that any moves they generate can be shuffled around to
4038
       * reduce nop's:
4039
       */
4040
      if (ctx->block->predecessors_count == 1)
4041
         ctx->block = ctx->block->predecessors[0];
4042

4043
      /* Setup IR level outputs, which are "collects" that gather
4044
       * the scalar components of outputs.
4045
       */
4046
      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
4047
         unsigned ncomp = 0;
4048
         /* figure out the # of components written:
4049
          *
4050
          * TODO do we need to handle holes, ie. if .x and .z
4051
          * components written, but .y component not written?
4052
          */
4053
         for (unsigned j = 0; j < 4; j++) {
4054
            if (!ctx->outputs[i + j])
4055
               break;
4056
            ncomp++;
4057
         }
4058

4059
         /* Note that in some stages, like TCS, store_output is
4060
          * lowered to memory writes, so no components of the
4061
          * are "written" from the PoV of traditional store-
4062
          * output instructions:
4063
          */
4064
         if (!ncomp)
4065
            continue;
4066

4067
         struct ir3_instruction *out =
4068
            ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
4069

4070
         int outidx = i / 4;
4071
         assert(outidx < so->outputs_count);
4072

4073
         outidxs[outputs_count] = outidx;
4074
         outputs[outputs_count] = out;
4075
         outputs_count++;
4076
      }
4077

4078
      /* for a6xx+, binning and draw pass VS use same VBO state, so we
4079
       * need to make sure not to remove any inputs that are used by
4080
       * the nonbinning VS.
4081
       */
4082
      if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
4083
          so->type == MESA_SHADER_VERTEX) {
4084
         for (int i = 0; i < ctx->ninputs; i++) {
4085
            struct ir3_instruction *in = ctx->inputs[i];
4086

4087
            if (!in)
4088
               continue;
4089

4090
            unsigned n = i / 4;
4091
            unsigned c = i % 4;
4092

4093
            debug_assert(n < so->nonbinning->inputs_count);
4094

4095
            if (so->nonbinning->inputs[n].sysval)
4096
               continue;
4097

4098
            /* be sure to keep inputs, even if only used in VS */
4099
            if (so->nonbinning->inputs[n].compmask & (1 << c))
4100
               array_insert(in->block, in->block->keeps, in);
4101
         }
4102
      }
4103

4104
      ctx->block = old_block;
4105

4106
      struct ir3_instruction *end =
4107
         ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
4108

4109
      for (unsigned i = 0; i < outputs_count; i++) {
4110
         __ssa_src(end, outputs[i], 0);
4111
      }
4112

4113
      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
4114
      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4115

4116
      array_insert(ctx->block, ctx->block->keeps, end);
4117

4118
      /* at this point, for binning pass, throw away unneeded outputs: */
4119
      if (so->binning_pass && (ctx->compiler->gpu_id < 600))
4120
         fixup_binning_pass(ctx, end);
4121
   }
4122

4123
   ir3_debug_print(ir, "AFTER: nir->ir3");
4124
   ir3_validate(ir);
4125

4126
   IR3_PASS(ir, ir3_array_to_ssa);
4127

4128
   do {
4129
      progress = false;
4130

4131
      progress |= IR3_PASS(ir, ir3_cf);
4132
      progress |= IR3_PASS(ir, ir3_cp, so);
4133
      progress |= IR3_PASS(ir, ir3_cse);
4134
      progress |= IR3_PASS(ir, ir3_dce, so);
4135
   } while (progress);
4136

4137
   /* at this point, for binning pass, throw away unneeded outputs:
4138
    * Note that for a6xx and later, we do this after ir3_cp to ensure
4139
    * that the uniform/constant layout for BS and VS matches, so that
4140
    * we can re-use same VS_CONST state group.
4141
    */
4142
   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
4143
      fixup_binning_pass(ctx, find_end(ctx->so->ir));
4144
      /* cleanup the result of removing unneeded outputs: */
4145
      while (IR3_PASS(ir, ir3_dce, so)) {
4146
      }
4147
   }
4148

4149
   IR3_PASS(ir, ir3_sched_add_deps);
4150

4151
   /* At this point, all the dead code should be long gone: */
4152
   assert(!IR3_PASS(ir, ir3_dce, so));
4153

4154
   ret = ir3_sched(ir);
4155
   if (ret) {
4156
      DBG("SCHED failed!");
4157
      goto out;
4158
   }
4159

4160
   ir3_debug_print(ir, "AFTER: ir3_sched");
4161

4162
   if (IR3_PASS(ir, ir3_cp_postsched)) {
4163
      /* cleanup the result of removing unneeded mov's: */
4164
      while (IR3_PASS(ir, ir3_dce, so)) {
4165
      }
4166
   }
4167

4168
   /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
4169
    * with draw pass VS, so binning and draw pass can both use the
4170
    * same VBO state.
4171
    *
4172
    * Note that VS inputs are expected to be full precision.
4173
    */
4174
   bool pre_assign_inputs = (ir->compiler->gpu_id >= 600) &&
4175
                            (ir->type == MESA_SHADER_VERTEX) &&
4176
                            so->binning_pass;
4177

4178
   if (pre_assign_inputs) {
4179
      foreach_input (in, ir) {
4180
         assert(in->opc == OPC_META_INPUT);
4181
         unsigned inidx = in->input.inidx;
4182

4183
         in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
4184
      }
4185
   } else if (ctx->tcs_header) {
4186
      /* We need to have these values in the same registers between VS and TCS
4187
       * since the VS chains to TCS and doesn't get the sysvals redelivered.
4188
       */
4189

4190
      ctx->tcs_header->dsts[0]->num = regid(0, 0);
4191
      ctx->primitive_id->dsts[0]->num = regid(0, 1);
4192
   } else if (ctx->gs_header) {
4193
      /* We need to have these values in the same registers between producer
4194
       * (VS or DS) and GS since the producer chains to GS and doesn't get
4195
       * the sysvals redelivered.
4196
       */
4197

4198
      ctx->gs_header->dsts[0]->num = regid(0, 0);
4199
      ctx->primitive_id->dsts[0]->num = regid(0, 1);
4200
   } else if (so->num_sampler_prefetch) {
4201
      assert(so->type == MESA_SHADER_FRAGMENT);
4202
      int idx = 0;
4203

4204
      foreach_input (instr, ir) {
4205
         if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
4206
            continue;
4207

4208
         assert(idx < 2);
4209
         instr->dsts[0]->num = idx;
4210
         idx++;
4211
      }
4212
   }
4213

4214
   ret = ir3_ra(so);
4215

4216
   if (ret) {
4217
      mesa_loge("ir3_ra() failed!");
4218
      goto out;
4219
   }
4220

4221
   IR3_PASS(ir, ir3_postsched, so);
4222

4223
   IR3_PASS(ir, ir3_lower_subgroups);
4224

4225
   if (so->type == MESA_SHADER_FRAGMENT)
4226
      pack_inlocs(ctx);
4227

4228
   /*
4229
    * Fixup inputs/outputs to point to the actual registers assigned:
4230
    *
4231
    * 1) initialize to r63.x (invalid/unused)
4232
    * 2) iterate IR level inputs/outputs and update the variants
4233
    *    inputs/outputs table based on the assigned registers for
4234
    *    the remaining inputs/outputs.
4235
    */
4236

4237
   for (unsigned i = 0; i < so->inputs_count; i++)
4238
      so->inputs[i].regid = INVALID_REG;
4239
   for (unsigned i = 0; i < so->outputs_count; i++)
4240
      so->outputs[i].regid = INVALID_REG;
4241

4242
   struct ir3_instruction *end = find_end(so->ir);
4243

4244
   for (unsigned i = 0; i < end->srcs_count; i++) {
4245
      unsigned outidx = end->end.outidxs[i];
4246
      struct ir3_register *reg = end->srcs[i];
4247

4248
      so->outputs[outidx].regid = reg->num;
4249
      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
4250
   }
4251

4252
   foreach_input (in, ir) {
4253
      assert(in->opc == OPC_META_INPUT);
4254
      unsigned inidx = in->input.inidx;
4255

4256
      if (pre_assign_inputs && !so->inputs[inidx].sysval) {
4257
         if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
4258
            compile_assert(
4259
               ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
4260
            compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
4261
                                   so->nonbinning->inputs[inidx].half);
4262
         }
4263
         so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
4264
         so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
4265
      } else {
4266
         so->inputs[inidx].regid = in->dsts[0]->num;
4267
         so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
4268
      }
4269
   }
4270

4271
   if (ctx->astc_srgb)
4272
      fixup_astc_srgb(ctx);
4273

4274
   /* We need to do legalize after (for frag shader's) the "bary.f"
4275
    * offsets (inloc) have been assigned.
4276
    */
4277
   IR3_PASS(ir, ir3_legalize, so, &max_bary);
4278

4279
   /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
4280
    * know what we might have to wait on when coming in from VS chsh.
4281
    */
4282
   if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
4283
      foreach_block (block, &ir->block_list) {
4284
         foreach_instr (instr, &block->instr_list) {
4285
            instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
4286
            break;
4287
         }
4288
      }
4289
   }
4290

4291
   so->branchstack = ctx->max_stack;
4292

4293
   /* Note that actual_in counts inputs that are not bary.f'd for FS: */
4294
   if (so->type == MESA_SHADER_FRAGMENT)
4295
      so->total_in = max_bary + 1;
4296

4297
   /* Collect sampling instructions eligible for pre-dispatch. */
4298
   collect_tex_prefetches(ctx, ir);
4299

4300
   if (so->type == MESA_SHADER_FRAGMENT &&
4301
       ctx->s->info.fs.needs_quad_helper_invocations)
4302
      so->need_pixlod = true;
4303

4304
   if (so->type == MESA_SHADER_COMPUTE) {
4305
      so->local_size[0] = ctx->s->info.workgroup_size[0];
4306
      so->local_size[1] = ctx->s->info.workgroup_size[1];
4307
      so->local_size[2] = ctx->s->info.workgroup_size[2];
4308
      so->local_size_variable = ctx->s->info.workgroup_size_variable;
4309
   }
4310

4311
out:
4312
   if (ret) {
4313
      if (so->ir)
4314
         ir3_destroy(so->ir);
4315
      so->ir = NULL;
4316
   }
4317
   ir3_context_free(ctx);
4318

4319
   return ret;
4320
}
4321

4322
Product

Resources

Company