Path: blob/21.2-virgl/src/gallium/auxiliary/nir/nir_to_tgsi.c
/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_ureg.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool any_reg_as_address;
   bool native_integers;
   bool has_txf_lz;

   int next_addr_reg;
   bool addr_declared[2];
   struct ureg_dst addr_reg[2];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_dst *ssa_temp;

   nir_instr_liveness *liveness;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats. Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}
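
/* For illustration (this comment is not in the original source): a 64-bit
 * value occupies a pair of 32-bit channels, so a NIR write mask over 64-bit
 * components expands to cover both halves, e.g.:
 *
 *    ntt_64bit_write_mask(0x1) == 0x3   (X  -> XY)
 *    ntt_64bit_write_mask(0x2) == 0xc   (Y  -> ZW)
 *    ntt_64bit_write_mask(0x3) == 0xf   (XY -> XYZW)
 */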

static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}

static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}
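
/* A worked example of the 64-bit usage-mask expansion above (illustration,
 * not from the original source): a dvec2 at location_frac 0 starts as usage
 * mask XY and widens to XYZW, while a lone double at location_frac 2 (mask Z)
 * is first shifted down to X and then widened to XY.
 */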

static struct ureg_dst
ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      if (semantics.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* the z write goes to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = false;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask = nir_intrinsic_write_mask(instr);

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }
   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_store_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return frac == 0;
}
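
/* For illustration (not in the original source): this catches the common NIR
 * pattern, roughly
 *
 *    vec1 32 ssa_2 = fadd ssa_0, ssa_1
 *    intrinsic store_output (ssa_2, 0) ...
 *
 * and lets the ADD write the TGSI OUT register directly, instead of writing a
 * TEMP that store_output then has to MOV to the output.
 */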

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
                                                    semantic_name,
                                                    semantic_index,
                                                    interpolation,
                                                    0,
                                                    sample_loc,
                                                    var->data.driver_location,
                                                    usage_mask,
                                                    array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
         /* NIR is ~0 front and 0 back, while TGSI is +1 front */
         ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   struct pipe_screen *screen = c->screen;
   bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);

   nir_foreach_uniform_variable(var, c->s) {
      if (glsl_type_is_image(var->type)) {
         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                        var->data.binding,
                                                        TGSI_TEXTURE_2D,
                                                        var->data.image.format,
                                                        !var->data.read_only,
                                                        false);
      } else {
         unsigned size;
         if (packed) {
            size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
                                                       var->data.bindless), 4);
         } else {
            size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
         }

         for (unsigned i = 0; i < size; i++)
            ureg_DECL_constant(c->ureg, var->data.driver_location + i);
      }
   }

   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location);
   }

   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, var->data.binding, atomic);
   }

   for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
      if (BITSET_TEST(c->s->info.textures_used, i))
         ureg_DECL_sampler(c->ureg, i);
   }
}

static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_register, nir_reg, node, list) {
      struct ureg_dst decl;
      if (nir_reg->num_array_elems == 0) {
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                          nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
         }
      } else {
         decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
                                          true);
      }
      c->reg_temp[nir_reg->index] = decl;
   }
}
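
/* A rough worked example of the packed-uniform sizing above (my reading, not
 * from the original source): with PIPE_CAP_PACKED_UNIFORMS, a float[3]
 * uniform occupies 3 dwords and thus DIV_ROUND_UP(3, 4) == 1 constant slot,
 * while the vec4-aligned path would reserve one vec4 slot per array element,
 * i.e. 3 slots.
 */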

static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}

static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
{
   if (c->any_reg_as_address) {
      /* Make sure we're getting the refcounting right even on any_reg
       * drivers.
       */
      c->next_addr_reg++;

      return ureg_scalar(addr, 0);
   }

   assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));

   if (!c->addr_declared[c->next_addr_reg]) {
      c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
                                                     TGSI_WRITEMASK_X);
      c->addr_declared[c->next_addr_reg] = true;
   }

   if (c->native_integers)
      ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   else
      ureg_ARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
}

static void
ntt_put_reladdr(struct ntt_compile *c)
{
   c->next_addr_reg--;
   assert(c->next_addr_reg >= 0);
}

static void
ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
{
   if (c->any_reg_as_address)
      return;

   if (dst.Indirect)
      ntt_put_reladdr(c);
   if (dst.DimIndirect)
      ntt_put_reladdr(c);
}

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return ureg_src(c->ssa_temp[src.ssa->index]);
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   if (nir_src_bit_size(src.src) == 64) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}
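
/* For illustration (not in the original source): 64-bit component n lives in
 * the 32-bit channel pair (2n, 2n+1), so a NIR swizzle of .y on a 64-bit src
 * becomes the 32-bit TGSI swizzle .zwzw above.
 */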

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

static struct ureg_dst *
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ureg_DECL_temporary(c->ureg);

   c->ssa_temp[ssa->index] = ureg_writemask(dst, writemask);

   return &c->ssa_temp[ssa->index];
}

static struct ureg_dst *
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return &c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = *ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
       src.SwizzleX == TGSI_SWIZZLE_X &&
       (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
       (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
       (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = ureg_dst(src);
         return;
      }
   }

   ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ureg_MOV(c->ureg, dst, src);
   }
}
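
/* For illustration (not in the original source): when the stored value is a
 * plain, unswizzled IMM/IN/CONST/SV reference, ntt_store_def() just aliases
 * the SSA def to that register, so later readers fetch e.g. IN[0] directly
 * and no MOV is emitted at all.
 */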

static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   unsigned i;
   int num_src;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op == TGSI_OPCODE_POW) {
      num_src = 2;
   } else {
      num_src = 1;
      src1 = src0;
   }

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         struct ureg_dst this_dst = dst;
         struct ureg_src srcs[2] = {
            ureg_scalar(src0, i),
            ureg_scalar(src1, i),
         };
         this_dst.WriteMask = (1 << i);

         ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
      }
   }
}
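
/* For illustration (not in the original source): TGSI scalar ops replicate
 * one source channel to every written channel, so a NIR frcp with write mask
 * .xz is emitted as two instructions, roughly:
 *
 *    RCP TEMP[0].x, SRC.xxxx
 *    RCP TEMP[0].z, SRC.zzzz
 */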

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },

      /* These bitwise ops don't care about 32 vs 64 types, so they have the
       * same TGSI op.
       */
      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },

      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
   };
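
   /* For illustration (not in the original source): entries are indexed by
    * [nir op][src_64], so op_map[nir_op_fadd] yields TGSI_OPCODE_ADD for
    * 32-bit sources and TGSI_OPCODE_DADD for 64-bit ones; a 0 entry means
    * "fall through to the special cases below".
    */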

   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
    * of .xy.  Store to a temp and move it to the real dst.
    */
   bool tgsi_64bit_compare = src_64 && !dst_64 &&
      (num_srcs == 2 ||
       nir_op_infos[instr->op].output_type == nir_type_bool32) &&
      (dst.WriteMask != TGSI_WRITEMASK_X);

   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
    * channels and will need to get fixed up.
    */
   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
                                  num_srcs == 1 && !tgsi_64bit_compare &&
                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));

   struct ureg_dst real_dst = ureg_dst_undef();
   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
      real_dst = dst;
      dst = ureg_DECL_temporary(c->ureg);
   }

   bool table_op64 = src_64;
   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ureg_insn(c->ureg, op_map[instr->op][table_op64],
                &dst, 1, src, num_srcs, false);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntt_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */

      switch (instr->op) {
      case nir_op_u2u64:
         ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
         break;

      case nir_op_i2i32:
      case nir_op_u2u32:
         assert(src_64);
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
         break;

      case nir_op_fabs:
         ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         if (dst_64) {
            ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
            ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
         } else {
            ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
         }
         break;

      case nir_op_fneg:
         ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
         break;

      /* NOTE: TGSI 32-bit math ops have the old "one source channel
       * replicated to all dst channels" behavior, while 64 is normal mapping
       * of src channels to dst.
       */
      case nir_op_frcp:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
         break;

      case nir_op_frsq:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
         break;

      case nir_op_fsqrt:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
         break;

      case nir_op_fexp2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
         break;

      case nir_op_flog2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
         break;

      case nir_op_b2f32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
         break;

      case nir_op_b2f64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ntt_64bit_1f(c));
         break;

      case nir_op_f2b32:
         if (src_64)
            ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         else
            ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         break;

      case nir_op_i2b32:
         if (src_64) {
            ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         } else
            ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         break;

      case nir_op_b2i32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
         break;

      case nir_op_b2i64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
         break;

      case nir_op_fsin:
         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
         break;

      case nir_op_fcos:
         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
         break;

      case nir_op_fsub:
         assert(!dst_64);
         ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_isub:
         assert(!dst_64);
         ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
         break;

      case nir_op_pack_64_2x32_split:
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
                  ureg_swizzle(src[1],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_unpack_64_2x32_split_x:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
         break;

      case nir_op_unpack_64_2x32_split_y:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
         break;

      case nir_op_b32csel:
         if (nir_src_bit_size(instr->src[1].src) == 64) {
            ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                      src[1], src[2]);
         } else {
            ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
         }
         break;

      case nir_op_fcsel:
         /* NIR is src0 != 0 ? src1 : src2.
          * TGSI is src0 < 0 ? src1 : src2.
          *
          * However, fcsel so far as I can find only appears on
          * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
          */
         ureg_CMP(c->ureg, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
         break;

      /* It would be nice if we could get this left as scalar in NIR, since
       * the TGSI op is scalar.
       */
      case nir_op_frexp_sig:
      case nir_op_frexp_exp: {
         assert(src_64);
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);

         for (int chan = 0; chan < 2; chan++) {
            int wm = 1 << chan;

            if (!(instr->dest.write_mask & wm))
               continue;

            struct ureg_dst dsts[2] = { temp, temp };
            if (instr->op == nir_op_frexp_sig) {
               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
            } else {
               dsts[1] = ureg_writemask(dst, wm);
            }

            struct ureg_src chan_src = ureg_swizzle(src[0],
                                                    chan * 2, chan * 2 + 1,
                                                    chan * 2, chan * 2 + 1);

            ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
                      dsts, 2,
                      &chan_src, 1, false);
         }

         ureg_release_temporary(c->ureg, temp);
         break;
      }
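
      /* For illustration (not in the original source): DFRACEXP is a
       * two-destination TGSI opcode (significand, exponent).  The loop above
       * keeps whichever destination the NIR op asked for and aims the other
       * one at a scratch temp, one 64-bit channel at a time.
       */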

      case nir_op_ldexp:
         assert(dst_64); /* 32bit handled in table. */
         ureg_DLDEXP(c->ureg, dst, src[0],
                     ureg_swizzle(src[1],
                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   /* 64-bit op fixup movs */
   if (!ureg_dst_is_undef(real_dst)) {
      if (tgsi_64bit_compare) {
         ureg_MOV(c->ureg, real_dst,
                  ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
      } else {
         assert(tgsi_64bit_downconvert);
         uint8_t swizzle[] = {0, 0, 0, 0};
         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
         if (second_bit)
            swizzle[ffs(second_bit) - 1] = 1;
         ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
                                                  swizzle[0],
                                                  swizzle[1],
                                                  swizzle[2],
                                                  swizzle[3]));
      }
      ureg_release_temporary(c->ureg, dst);
   }
}

static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntt_src_as_uint(c, src);
      return usrc;
   } else {
      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntt_src_as_uint(c, src);
      return dst;
   } else {
      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   } else {
      return ureg_src_dimension_indirect(usrc,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
   } else {
      return ureg_dst_dimension_indirect(udst,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}
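
/* For illustration (not in the original source): with frac == 2 and
 * num_components == 2, a vec2 living in channels .zw gets the swizzle .zwww,
 * so the value lands in the destination's .xy channels.
 */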

static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   src = ntt_ureg_src_dimension_indirect(c, src, instr->src[0]);

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
       */
      struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
      struct ureg_src srcs[2] = {
         src,
         ntt_get_src(c, instr->src[1]),
      };
      ureg_memory_insn(c->ureg, TGSI_OPCODE_LOAD,
                       &dst, 1,
                       srcs, ARRAY_SIZE(srcs),
                       0 /* qualifier */,
                       0 /* tex target */,
                       0 /* format: unused */
                       );
   }
}
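
/* For illustration (not in the original source): on the load_ubo_vec4 path
 * the offset src is in vec4 slots, so a constant access like u[2].y in UBO 1
 * becomes roughly CONST[1][2].yyyy, while the TGSI_OPCODE_LOAD path instead
 * passes the (as I read it, byte-based) offset as a second operand.
 */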

static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
{
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   unsigned qualifier = 0;

   if (access & ACCESS_COHERENT)
      qualifier |= TGSI_MEMORY_COHERENT;
   if (access & ACCESS_VOLATILE)
      qualifier |= TGSI_MEMORY_VOLATILE;
   if (access & ACCESS_RESTRICT)
      qualifier |= TGSI_MEMORY_RESTRICT;

   return qualifier;
}

static void
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
             nir_variable_mode mode)
{
   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                    instr->intrinsic == nir_intrinsic_store_shared);
   bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_shared);
   unsigned opcode;
   struct ureg_src src[4];
   int num_src = 0;
   int nir_src;

   struct ureg_src memory;
   switch (mode) {
   case nir_var_mem_ssbo:
      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
                                     instr->src[is_store ? 1 : 0]);
      nir_src = 1;
      break;
   case nir_var_mem_shared:
      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
      nir_src = 0;
      break;
   default:
      unreachable("unknown memory type");
   }

   if (is_store) {
      src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
   } else {
      src[num_src++] = memory;
      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
         src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
         if (!is_load)
            src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_shared_atomic_add:
      opcode = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_shared_atomic_fadd:
      opcode = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
      opcode = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
      opcode = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
      opcode = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
      opcode = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_shared_atomic_and:
      opcode = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_shared_atomic_or:
      opcode = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
      opcode = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_shared_atomic_exchange:
      opcode = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_comp_swap:
      opcode = TGSI_OPCODE_ATOMCAS;
      src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
      break;
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
      opcode = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
      opcode = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_get_ssbo_size:
      opcode = TGSI_OPCODE_RESQ;
      break;
   default:
      unreachable("unknown memory op");
   }

   unsigned qualifier = 0;
   if (mode == nir_var_mem_ssbo &&
       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
      qualifier = ntt_get_access_qualifier(instr);
   }

   struct ureg_dst dst;
   if (is_store) {
      dst = ureg_dst(memory);

      unsigned write_mask = nir_intrinsic_write_mask(instr);
      if (nir_src_bit_size(instr->src[0]) == 64)
         write_mask = ntt_64bit_write_mask(write_mask);
      dst = ureg_writemask(dst, write_mask);
   } else {
      dst = ntt_get_dest(c, &instr->dest);
   }

   ureg_memory_insn(c->ureg, opcode,
                    &dst, 1,
                    src, num_src,
                    qualifier,
                    TGSI_TEXTURE_BUFFER,
                    0 /* format: unused */);
}
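
/* For illustration (not in the original source): NIR and TGSI put the
 * resource on opposite sides for stores.  nir_intrinsic_store_ssbo's srcs are
 * (value, block, offset), while the TGSI form is "STORE BUFFER[n], offset,
 * value", which is why the is_store path above reorders the operands and
 * makes the buffer the instruction's destination.
 */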

static enum tgsi_texture_type
tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
      return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   unsigned op;
   struct ureg_src srcs[4];
   int num_src = 0;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);

   struct ureg_dst temp = ureg_dst_undef();

   enum tgsi_texture_type target = tgsi_target_from_sampler_dim(dim, is_array);

   struct ureg_src resource =
      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
                            instr->src[0]);

   struct ureg_dst dst;
   if (instr->intrinsic == nir_intrinsic_image_store) {
      dst = ureg_dst(resource);
   } else {
      srcs[num_src++] = resource;
      dst = ntt_get_dest(c, &instr->dest);
   }

   if (instr->intrinsic != nir_intrinsic_image_size) {
      struct ureg_src coord = ntt_get_src(c, instr->src[1]);

      if (dim == GLSL_SAMPLER_DIM_MS) {
         temp = ureg_DECL_temporary(c->ureg);
         ureg_MOV(c->ureg, temp, coord);
         ureg_MOV(c->ureg, ureg_writemask(temp, 1 << (is_array ? 3 : 2)),
                  ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
         coord = ureg_src(temp);
      }
      srcs[num_src++] = coord;

      if (instr->intrinsic != nir_intrinsic_image_load) {
         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_image_load:
      op = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_image_store:
      op = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_image_size:
      op = TGSI_OPCODE_RESQ;
      break;
   case nir_intrinsic_image_atomic_add:
      op = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_image_atomic_fadd:
      op = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_image_atomic_imin:
      op = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_image_atomic_umin:
      op = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_image_atomic_imax:
      op = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_image_atomic_umax:
      op = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_image_atomic_and:
      op = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_image_atomic_or:
      op = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_image_atomic_xor:
      op = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_image_atomic_exchange:
      op = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      op = TGSI_OPCODE_ATOMCAS;
      break;
   default:
      unreachable("bad op");
   }

   ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
                    ntt_get_access_qualifier(instr),
                    target,
                    nir_intrinsic_format(instr));

   if (!ureg_dst_is_undef(temp))
      ureg_release_temporary(c->ureg, temp);
}
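
/* For illustration (not in the original source): NIR passes the MSAA sample
 * index as a separate src, but TGSI folds it into the coordinate (.z for
 * 2D_MSAA, .w for 2D_ARRAY_MSAA), hence the temp shuffle above, roughly:
 *
 *    MOV TEMP[0], coord
 *    MOV TEMP[0].z, sample.xxxx
 */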

static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   bool is_64 = nir_dest_bit_size(instr->dest) == 64;

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      /* XXX: ArrayID is used in r600 gs inputs */
      uint32_t array_id = 0;

      input = ureg_DECL_input_layout(c->ureg,
                                     semantic_name,
                                     semantic_index,
                                     base,
                                     ntt_tgsi_usage_mask(frac,
                                                         instr->num_components,
                                                         is_64),
                                     array_id,
                                     semantics.num_slots);
   } else {
      input = c->input_index_map[base];
   }

   if (is_64)
      num_components *= 2;

   input = ntt_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_per_vertex_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);

      nir_intrinsic_instr *bary_instr =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         ntt_store(c, &instr->dest, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntt_store(c, &instr->dest, input);
         } else {
            ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
                                 input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ureg_imm1u(c->ureg,
                                       ntt_src_as_uint(c, bary_instr->src[0])));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}

static void
ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ntt_get_src(c, instr->src[0]);

   if (src.File == TGSI_FILE_OUTPUT) {
      /* If our src is the output file, that's an indication that we were able
       * to emit the output stores in the generating instructions and we have
       * nothing to do here.
       */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntt_store_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = { 0, 0, 0, 0 };
   for (int i = frac; i < 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ureg_MOV(c->ureg, out, src);
   ntt_reladdr_dst_put(c, out);
}
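
/* For illustration (not in the original source): with frac == 2 the output
 * writemask is .zw, so the loop above builds swizzle[] = {0, 0, 0, 1}, i.e.
 * OUT.z reads src.x and OUT.w reads src.y in the final MOV.
 */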

static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
   sv = ntt_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
    */
   if (!c->native_integers) {
      switch (instr->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_instance_id:
         ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
         return;

      default:
         break;
      }
   }

   ntt_store(c, &instr->dest, sv);
}
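
/* For illustration (not in the original source): gl_NumWorkGroups is a
 * uvec3, so write_mask is XYZ and ntt_swizzle_for_write_mask() turns the
 * sysval read into .xyzx, keeping the undefined .w channel from ever being
 * referenced.
 */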

static void
ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
      ntt_emit_load_ubo(c, instr);
      break;

   /* Vertex */
   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_sample_id:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_tess_coord:
   case nir_intrinsic_load_patch_vertices_in:
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_tess_level_outer:
   case nir_intrinsic_load_tess_level_inner:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      ntt_emit_load_sysval(c, instr);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
      ntt_emit_load_input(c, instr);
      break;

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      ntt_emit_store_output(c, instr);
      break;

   case nir_intrinsic_discard:
      ureg_KILL(c->ureg);
      break;

   case nir_intrinsic_discard_if: {
      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);

      if (c->native_integers) {
         struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
         ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
         ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
         ureg_release_temporary(c->ureg, temp);
      } else {
         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
         ureg_KILL_IF(c->ureg, ureg_negate(cond));
      }
      break;
   }

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_get_ssbo_size:
      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      ntt_emit_mem(c, instr, nir_var_mem_shared);
      break;

   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_image_atomic_fadd:
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
      ntt_emit_image_load_store(c, instr);
      break;

   case nir_intrinsic_control_barrier:
      ureg_BARRIER(c->ureg);
      break;

   case nir_intrinsic_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_memory_barrier_atomic_counter:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_buffer:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_image:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
      break;

   case nir_intrinsic_memory_barrier_shared:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_group_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED |
                                      TGSI_MEMBAR_THREAD_GROUP));
      break;

   case nir_intrinsic_end_primitive:
      ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   case nir_intrinsic_emit_vertex:
      ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   /* In TGSI we don't actually generate the barycentric coords, and emit
    * interp intrinsics later.  However, we do need to store the _at_offset
    * argument so that we can use it at that point.
    */
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_at_sample:
      break;

   case nir_intrinsic_load_barycentric_at_offset:
      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
      break;

   default:
      fprintf(stderr, "Unknown intrinsic: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      break;
   }
}

struct ntt_tex_operand_state {
   struct ureg_src srcs[4];
   unsigned i;
   unsigned chan;
   bool is_temp[4];
};

static void
ntt_push_tex_arg(struct ntt_compile *c,
                 nir_tex_instr *instr,
                 nir_tex_src_type tex_src_type,
                 struct ntt_tex_operand_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
   int num_components = nir_tex_instr_src_size(instr, tex_src);

   /* Find which src in the tex args we'll fit in. */
   if (s->chan + num_components > 4) {
      s->chan = 0;
      s->i++;
   }

   /* Would need to fix up swizzling up to the writemask channel here. */
   assert(num_components == 1 || s->chan == 0);
   if (num_components == 1)
      src = ureg_scalar(src, 0);

   if (ureg_src_is_undef(s->srcs[s->i])) {
      /* First emit of a tex operand's components, no need for a mov. */
      s->srcs[s->i] = src;
   } else {
      /* Otherwise, we need to have a temporary for all the components that go
       * in this operand.
       */
      if (!s->is_temp[s->i]) {
         struct ureg_src prev_src = s->srcs[s->i];
         s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
         s->is_temp[s->i] = true;

         ureg_MOV(c->ureg,
                  ureg_writemask(ureg_dst(s->srcs[s->i]),
                                 BITFIELD_MASK(s->chan)), prev_src);
      }

      ureg_MOV(c->ureg,
               ureg_writemask(ureg_dst(s->srcs[s->i]),
                              BITFIELD_RANGE(s->chan, num_components)),
               src);
   }

   s->chan += num_components;
}
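
/* For illustration (not in the original source): a shadow sampler2D lookup
 * packs coord.xy and the comparator into one operand, so after
 * ntt_push_tex_arg(coord) and ntt_push_tex_arg(comparator) the emitted op is
 * roughly "TEX dst, TEMP[n], SAMP[0], SHADOW2D" with the comparator in
 * TEMP[n].z.
 */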
static void
ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
{
   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
   unsigned target;
   unsigned tex_opcode;

   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
   if (sampler_src >= 0) {
      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
   }

   switch (instr->op) {
   case nir_texop_tex:
      tex_opcode = TGSI_OPCODE_TEX;
      break;
   case nir_texop_txf:
   case nir_texop_txf_ms:
      tex_opcode = TGSI_OPCODE_TXF;

      if (c->has_txf_lz) {
         int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
         if (lod_src >= 0 &&
             nir_src_is_const(instr->src[lod_src].src) &&
             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
            tex_opcode = TGSI_OPCODE_TXF_LZ;
         }
      }
      break;
   case nir_texop_txl:
      tex_opcode = TGSI_OPCODE_TXL;
      break;
   case nir_texop_txb:
      tex_opcode = TGSI_OPCODE_TXB;
      break;
   case nir_texop_txd:
      tex_opcode = TGSI_OPCODE_TXD;
      break;
   case nir_texop_txs:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_tg4:
      tex_opcode = TGSI_OPCODE_TG4;
      break;
   case nir_texop_query_levels:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_lod:
      tex_opcode = TGSI_OPCODE_LODQ;
      break;
   case nir_texop_texture_samples:
      tex_opcode = TGSI_OPCODE_TXQS;
      break;
   default:
      unreachable("unsupported tex op");
   }

   struct ntt_tex_operand_state s = { .i = 0 };
   ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.chan = MAX2(s.chan, 2);

   ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
   s.chan = MAX2(s.chan, 3);

   ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
   if (tex_opcode != TGSI_OPCODE_TXF_LZ)
      ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);

   /* End of packed src setup, everything that follows gets its own operand. */
   if (s.chan)
      s.i++;

   switch (instr->sampler_dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW1D_ARRAY;
         } else {
            target = TGSI_TEXTURE_1D_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW1D;
         } else {
            target = TGSI_TEXTURE_1D;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW2D_ARRAY;
         } else {
            target = TGSI_TEXTURE_2D_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOW2D;
         } else {
            target = TGSI_TEXTURE_2D;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_MS:
      if (instr->is_array) {
         target = TGSI_TEXTURE_2D_ARRAY_MSAA;
      } else {
         target = TGSI_TEXTURE_2D_MSAA;
      }
      break;
   case GLSL_SAMPLER_DIM_3D:
      assert(!instr->is_shadow);
      target = TGSI_TEXTURE_3D;
      break;
   case GLSL_SAMPLER_DIM_RECT:
      if (instr->is_shadow) {
         target = TGSI_TEXTURE_SHADOWRECT;
      } else {
         target = TGSI_TEXTURE_RECT;
      }
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      if (instr->is_array) {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
         } else {
            target = TGSI_TEXTURE_CUBE_ARRAY;
         }
      } else {
         if (instr->is_shadow) {
            target = TGSI_TEXTURE_SHADOWCUBE;
         } else {
            target = TGSI_TEXTURE_CUBE;
         }
      }
      break;
   case GLSL_SAMPLER_DIM_BUF:
      target = TGSI_TEXTURE_BUFFER;
      break;
   default:
      fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
      abort();
   }

   if (s.i > 1) {
      if (tex_opcode == TGSI_OPCODE_TEX)
         tex_opcode = TGSI_OPCODE_TEX2;
      if (tex_opcode == TGSI_OPCODE_TXB)
         tex_opcode = TGSI_OPCODE_TXB2;
      if (tex_opcode == TGSI_OPCODE_TXL)
         tex_opcode = TGSI_OPCODE_TXL2;
   }

   if (instr->op == nir_texop_txd) {
      /* Derivs appear in their own src args */
      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
   }

   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
      if (c->screen->get_param(c->screen,
                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
         sampler = ureg_scalar(sampler, instr->component);
         s.srcs[s.i++] = ureg_src_undef();
      } else {
         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
      }
   }

   s.srcs[s.i++] = sampler;

   enum tgsi_return_type tex_type;
   switch (instr->dest_type) {
   case nir_type_float32:
      tex_type = TGSI_RETURN_TYPE_FLOAT;
      break;
   case nir_type_int32:
      tex_type = TGSI_RETURN_TYPE_SINT;
      break;
   case nir_type_uint32:
      tex_type = TGSI_RETURN_TYPE_UINT;
      break;
   default:
      unreachable("unknown texture type");
   }
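
   /* TGSI carries texel offsets (e.g. texelFetchOffset()) as a separate
    * tgsi_texture_offset array on the instruction rather than as an extra
    * source register, so translate the NIR offset src into that form.
    */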
   struct tgsi_texture_offset tex_offsets[4];
   unsigned num_tex_offsets = 0;
   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
   if (tex_offset_src >= 0) {
      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);

      tex_offsets[0].File = offset.File;
      tex_offsets[0].Index = offset.Index;
      tex_offsets[0].SwizzleX = offset.SwizzleX;
      tex_offsets[0].SwizzleY = offset.SwizzleY;
      tex_offsets[0].SwizzleZ = offset.SwizzleZ;
      tex_offsets[0].Padding = 0;

      num_tex_offsets = 1;
   }

   struct ureg_dst tex_dst;
   if (instr->op == nir_texop_query_levels)
      tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
   else
      tex_dst = dst;

   ureg_tex_insn(c->ureg, tex_opcode,
                 &tex_dst, 1,
                 target,
                 tex_type,
                 tex_offsets, num_tex_offsets,
                 s.srcs, s.i);

   if (instr->op == nir_texop_query_levels) {
      ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
      ureg_release_temporary(c->ureg, tex_dst);
   }

   for (int i = 0; i < s.i; i++) {
      if (s.is_temp[i])
         ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
   }
}

static void
ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
      ureg_BRK(c->ureg);
      break;

   case nir_jump_continue:
      ureg_CONT(c->ureg);
      break;

   default:
      fprintf(stderr, "Unknown jump instruction: ");
      nir_print_instr(&jump->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
{
   /* Nothing to do but make sure that we have some storage to deref. */
   (void)ntt_get_ssa_def_decl(c, &instr->def);
}

static void
ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
{
   /* There is no addr reg in use before we start emitting an instr. */
   c->next_addr_reg = 0;

   switch (instr->type) {
   case nir_instr_type_deref:
      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
      break;

   case nir_instr_type_alu:
      ntt_emit_alu(c, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_load_const:
      /* Nothing to do here, as load consts are done directly from
       * ntt_get_src() (since many constant NIR srcs will often get folded
       * directly into a register file index instead of as a TGSI src).
       */
      break;

   case nir_instr_type_tex:
      ntt_emit_texture(c, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      ntt_emit_jump(c, nir_instr_as_jump(instr));
      break;

   case nir_instr_type_ssa_undef:
      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
      break;

   default:
      fprintf(stderr, "Unknown NIR instr type: ");
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}
static void
ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
{
   unsigned label;
   ureg_UIF(c->ureg, c->if_cond, &label);
   ntt_emit_cf_list(c, &if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
      ureg_ELSE(c->ureg, &label);
      ntt_emit_cf_list(c, &if_stmt->else_list);
   }

   ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
   ureg_ENDIF(c->ureg);
}

static void
ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
{
   /* GLSL-to-TGSI never set the begin/end labels to anything, even though
    * nvfx does reference BGNLOOP's label.  Follow GLSL-to-TGSI's behavior
    * unless something comes up with a need.
    */
   unsigned begin_label;
   ureg_BGNLOOP(c->ureg, &begin_label);
   ntt_emit_cf_list(c, &loop->body);

   unsigned end_label;
   ureg_ENDLOOP(c->ureg, &end_label);
}

static void
ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
{
   /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
   if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
      return;

   ureg_release_temporary(c->ureg, c->ssa_temp[index]);
   memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
}

/* Releases any temporaries for SSA defs with a live interval ending at this
 * instruction.
 */
static bool
ntt_src_live_interval_end_cb(nir_src *src, void *state)
{
   struct ntt_compile *c = state;

   if (src->is_ssa) {
      nir_ssa_def *def = src->ssa;

      if (c->liveness->defs[def->index].end == src->parent_instr->index)
         ntt_free_ssa_temp_by_index(c, def->index);
   }

   return true;
}

static void
ntt_emit_block(struct ntt_compile *c, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      ntt_emit_instr(c, instr);

      nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
   }

   /* Set up the if condition for ntt_emit_if(), which we have to do before
    * freeing up the temps (the "if" is treated as inside the block for
    * liveness purposes, despite not being an instruction)
    *
    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
    * looks at all of .xyzw.  No harm in working around the bug.
    */
   nir_if *nif = nir_block_get_following_if(block);
   if (nif)
      c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);

   /* Free up any SSA temps that are unused at the end of the block. */
   unsigned index;
   BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
      unsigned def_end_ip = c->liveness->defs[index].end;
      if (def_end_ip == block->end_ip)
         ntt_free_ssa_temp_by_index(c, index);
   }
}

static void
ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
{
   /* There is no addr reg in use before we start emitting any part of a CF
    * node (such as an if condition)
    */
   c->next_addr_reg = 0;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntt_emit_block(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntt_emit_if(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         ntt_emit_loop(c, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}

static void
ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
{
   c->impl = impl;
   c->liveness = nir_live_ssa_defs_per_instr(impl);

   c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);

   ntt_setup_registers(c, &impl->registers);
   ntt_emit_cf_list(c, &impl->body);

   ralloc_free(c->liveness);
   c->liveness = NULL;
}

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
 * can handle for 64-bit values in TGSI.
 */
static bool
ntt_should_vectorize_instr(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_ibitfield_extract:
   case nir_op_ubitfield_extract:
   case nir_op_bitfield_insert:
      /* virglrenderer only looks at the .x channel of the offset/bits operands
       * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
       * offset/bits operands.
       *
       * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
       */
      return false;

   default:
      break;
   }

   unsigned num_components = alu->dest.dest.ssa.num_components;

   int src_bit_size = nir_src_bit_size(alu->src[0].src);
   int dst_bit_size = nir_dest_bit_size(alu->dest.dest);

   if (src_bit_size == 64 || dst_bit_size == 64) {
      if (num_components > 1)
         return false;
   }

   return true;
}
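
/* Worked example of the wrap check below (illustrative): with align == 4 the
 * merged access could start at any component, so the worst case is starting
 * at .w and only a single-component access passes; with align == 8 the worst
 * case start is .z, leaving room for two components.
 */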
static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should always be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}

static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = 0;

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
      indirect_mask |= nir_var_shader_in;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
      indirect_mask |= nir_var_shader_out;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}

static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool progress;
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned control_flow_depth =
      screen->get_shader_param(screen, pipe_stage,
                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_if, true);
      NIR_PASS(progress, s, nir_opt_peephole_select,
               control_flow_depth == 0 ? ~0 : 8, true, true);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ntt_should_vectorize_io,
         .robust_modes = 0,
      };
      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
      NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, s, nir_opt_trivial_continues);
      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
      NIR_PASS(progress, s, nir_opt_undef);
      NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);

   } while (progress);
}
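
/* 64-bit lowering: a TGSI register slot holds at most a vec2 of 64-bit
 * values (two channels each), so the passes below split 64-bit ALU ops,
 * intrinsics, and load_consts into vec2-sized pieces before translation.
 */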

/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
 * scalarize vec3/vec4s; we should probably fix that.
 */
static bool
scalarize_64bit(const nir_instr *instr, const void *data)
{
   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
           nir_src_bit_size(alu->src[0].src) == 64);
}

static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (nir_dest_bit_size(instr->dest) != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->dest.ssa.num_components = 2;
      second->dest.ssa.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_ssa_def *channels[4] = {
         nir_channel(b, &first->dest.ssa, 0),
         nir_channel(b, &first->dest.ssa, 1),
         nir_channel(b, &second->dest.ssa, 0),
         second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
      };
      nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
      nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_ssa_def *src0 = instr->src[0].ssa;
      nir_ssa_def *channels[4] = { 0 };
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_channel(b, src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_instr_rewrite_src(&first->instr, &first->src[0],
                            nir_src_for_ssa(nir_vec(b, channels, 2)));
      nir_instr_rewrite_src(&second->instr, &second->src[0],
                            nir_src_for_ssa(nir_vec(b, &channels[2],
                                                    second->num_components)));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
      offset_src = 1;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_ssa_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
                            nir_src_for_ssa(second_offset));
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}
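
/* Example of the intrinsic split above (illustrative NIR, not an exact
 * printout): a 64-bit vec4 UBO load
 *
 *    vec4 64 ssa_2 = intrinsic load_ubo (ssa_0, ssa_1)
 *
 * becomes two vec2 loads, with the second one's byte offset bumped by 16,
 * merged back together for the original load's users:
 *
 *    vec2 64 ssa_3 = intrinsic load_ubo (ssa_0, ssa_1)
 *    vec2 64 ssa_4 = intrinsic load_ubo (ssa_0, ssa_1 + 16)
 *    vec4 64 ssa_5 = vec4 ssa_3.x, ssa_3.y, ssa_4.x, ssa_4.y
 */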

static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first =
      nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_ssa_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_ssa_def *new = nir_vec(b, channels, num_components);
   nir_ssa_def_rewrite_uses(&instr->def, new);
   nir_instr_remove(&instr->instr);

   return true;
}

static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
                                      void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

static void
ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
{
   const struct nir_shader_compiler_options *options = s->options;
   bool lower_fsqrt =
      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);

   if (!options->lower_extract_byte ||
       !options->lower_extract_word ||
       !options->lower_insert_byte ||
       !options->lower_insert_word ||
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
       !options->lower_rotate ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
       options->lower_fsqrt != lower_fsqrt) {
      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
      *new_options = *s->options;

      new_options->lower_extract_byte = true;
      new_options->lower_extract_word = true;
      new_options->lower_insert_byte = true;
      new_options->lower_insert_word = true;
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
      new_options->lower_rotate = true;
      new_options->lower_uniforms_to_ubo = true;
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;

      s->options = new_options;
   }
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s);

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);
   NIR_PASS_V(s, nir_lower_regs_to_ssa);

   const nir_lower_tex_options lower_tex_options = {
      /* XXX: We could skip lowering of TXP for TEX with <= 3 coord_components.
       */
      .lower_txp = ~0,
   };
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   /* Only lower 32-bit floats.  The only other modifier type officially
    * supported by TGSI is 32-bit integer negates, but even those are broken on
    * virglrenderer, so skip lowering all integer and f64 float mods.
    */
   NIR_PASS_V(s, nir_lower_to_source_mods, nir_lower_float_source_mods);
   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);

   /* locals_to_regs will leave dead derefs that are good to clean up. */
   NIR_PASS_V(s, nir_lower_locals_to_regs);
   NIR_PASS_V(s, nir_opt_dce);

   if (debug) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->any_reg_as_address =
      screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);

   ntt_setup_inputs(c);
   ntt_setup_uniforms(c);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }

   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (debug) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}
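
/* Typical use by a driver's shader-state create hook (sketch; the state
 * names are illustrative):
 *
 *    if (cso->type == PIPE_SHADER_IR_NIR)
 *       tokens = nir_to_tgsi(cso->ir.nir, pscreen);
 *
 * Note that the NIR shader is consumed: callers that need to keep it around
 * must pass in a clone, per the comment above.
 */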

static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
};

/* Returns default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}
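
/* Example wiring in a driver's screen setup (sketch; assumes a gallium driver
 * whose compiler consumes TGSI only):
 *
 *    pscreen->get_compiler_options = nir_to_tgsi_get_compiler_options;
 */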