CoCalc -- agx_compile.c

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/asahi/compiler/agx_compile.c
⁴⁵⁶⁴ views
1
/*
2
 * Copyright (C) 2021 Alyssa Rosenzweig <[email protected]>
3
 * Copyright (C) 2020 Collabora Ltd.
4
 * Copyright © 2016 Broadcom
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice (including the next
14
 * paragraph) shall be included in all copies or substantial portions of the
15
 * Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
 * SOFTWARE.
24
 */
25

26
#include "main/mtypes.h"
27
#include "compiler/nir_types.h"
28
#include "compiler/nir/nir_builder.h"
29
#include "util/u_debug.h"
30
#include "agx_compile.h"
31
#include "agx_compiler.h"
32
#include "agx_builder.h"
33

34
static const struct debug_named_value agx_debug_options[] = {
35
   {"msgs",      AGX_DBG_MSGS,		"Print debug messages"},
36
   {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
37
   {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
38
   {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
39
   {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
40
   DEBUG_NAMED_VALUE_END
41
};
42

43
DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
44

45
int agx_debug = 0;
46

47
#define DBG(fmt, ...) \
48
   do { if (agx_debug & AGX_DBG_MSGS) \
49
      fprintf(stderr, "%s:%d: "fmt, \
50
            __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
51

52
static void
53
agx_block_add_successor(agx_block *block, agx_block *successor)
54
{
55
   assert(block != NULL && successor != NULL);
56

57
   /* Cull impossible edges */
58
   if (block->unconditional_jumps)
59
      return;
60

61
   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
62
      if (block->successors[i]) {
63
         if (block->successors[i] == successor)
64
            return;
65
         else
66
            continue;
67
      }
68

69
      block->successors[i] = successor;
70
      _mesa_set_add(successor->predecessors, block);
71
      return;
72
   }
73

74
   unreachable("Too many successors");
75
}
76

77
static void
78
agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
79
{
80
   /* Ensure we've been scalarized and bit size lowered */
81
   unsigned bit_size = instr->def.bit_size;
82
   assert(instr->def.num_components == 1);
83
   assert(bit_size == 1 || bit_size == 16 || bit_size == 32);
84

85
   /* Emit move, later passes can inline/push if useful */
86
   agx_mov_imm_to(b,
87
                  agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
88
                  nir_const_value_as_uint(instr->value[0], bit_size));
89
}
90

91
/* AGX appears to lack support for vertex attributes. Lower to global loads. */
92
static agx_instr *
93
agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)
94
{
95
   nir_src *offset_src = nir_get_io_offset_src(instr);
96
   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
97
   unsigned index = nir_intrinsic_base(instr) +
98
                    nir_src_as_uint(*offset_src);
99

100
   struct agx_shader_key *key = b->shader->key;
101
   struct agx_attribute attrib = key->vs.attributes[index];
102

103
   /* address = base + (stride * vertex_id) + src_offset */
104
   unsigned buf = attrib.buf;
105
   agx_index stride = agx_mov_imm(b, 32, key->vs.vbuf_strides[buf]);
106
   agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
107
   agx_index vertex_id = agx_register(10, AGX_SIZE_32); // TODO: RA
108
   agx_index offset = agx_imad(b, vertex_id, stride, src_offset, 0);
109

110
   /* Each VBO has a 64-bit = 4 x 16-bit address, lookup the base address as a sysval */
111
   unsigned num_vbos = key->vs.num_vbufs;
112
   unsigned base_length = (num_vbos * 4);
113
   agx_index base = agx_indexed_sysval(b->shader,
114
                                       AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);
115

116
   /* Load the data */
117
   assert(instr->num_components <= 4);
118

119
   bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);
120
   agx_index real_dest = agx_dest_index(&instr->dest);
121
   agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;
122

123
   agx_device_load_to(b, dest, base, offset, attrib.format,
124
                      BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
125

126
   agx_wait(b, 0);
127

128
   if (pad) {
129
      agx_index one = agx_mov_imm(b, 32, fui(1.0));
130
      agx_index zero = agx_mov_imm(b, 32, 0);
131
      agx_index channels[4] = { zero, zero, zero, one };
132
      for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)
133
         channels[i] = agx_p_extract(b, dest, i);
134
      for (unsigned i = instr->num_components; i < 4; ++i)
135
         channels[i] = agx_null();
136
      agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);
137
   }
138

139
   return NULL;
140
}
141

142
static agx_instr *
143
agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)
144
{
145
   unsigned components = instr->num_components;
146
   assert(components >= 1 && components <= 4);
147

148
   nir_src *offset = nir_get_io_offset_src(instr);
149
   assert(nir_src_is_const(*offset) && "no indirects");
150
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
151
   imm_index += nir_src_as_uint(*offset);
152

153
   agx_index chan[4] = { agx_null() };
154

155
   for (unsigned i = 0; i < components; ++i) {
156
      /* vec3 for each vertex, unknown what first 2 channels are for */
157
      agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
158
      chan[i] = agx_p_extract(b, values, 2);
159
   }
160

161
   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
162
         chan[0], chan[1], chan[2], chan[3]);
163
}
164

165
static agx_instr *
166
agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)
167
{
168
   ASSERTED unsigned components = instr->num_components;
169
   ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
170

171
   assert(components >= 1 && components <= 4);
172
   assert(parent);
173

174
   /* TODO: Interpolation modes */
175
   assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);
176

177
   nir_src *offset = nir_get_io_offset_src(instr);
178
   assert(nir_src_is_const(*offset) && "no indirects");
179
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
180
   imm_index += nir_src_as_uint(*offset) * 4;
181

182
   return agx_ld_vary_to(b, agx_dest_index(&instr->dest),
183
         agx_immediate(imm_index), components, true);
184
}
185

186
static agx_instr *
187
agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
188
{
189
   nir_src *offset = nir_get_io_offset_src(instr);
190
   assert(nir_src_is_const(*offset) && "todo: indirects");
191
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
192
   imm_index += nir_intrinsic_component(instr);
193
   imm_index += nir_src_as_uint(*offset);
194

195
   /* nir_lower_io_to_scalar */
196
   assert(nir_intrinsic_write_mask(instr) == 0x1);
197

198
   return agx_st_vary(b,
199
               agx_immediate(imm_index),
200
               agx_src_index(&instr->src[0]));
201
}
202

203
static agx_instr *
204
agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
205
{
206
   const nir_variable *var =
207
      nir_find_variable_with_driver_location(b->shader->nir,
208
            nir_var_shader_out, nir_intrinsic_base(instr));
209
   assert(var);
210

211
   unsigned loc = var->data.location;
212
   assert(var->data.index == 0 && "todo: dual-source blending");
213
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
214
   unsigned rt = (loc - FRAG_RESULT_DATA0);
215

216
   /* TODO: Reverse-engineer interactions with MRT */
217
   if (b->shader->nir->info.internal) {
218
      /* clear */
219
   } else if (b->shader->did_writeout) {
220
	   agx_writeout(b, 0x0004);
221
   } else {
222
	   agx_writeout(b, 0xC200);
223
	   agx_writeout(b, 0x000C);
224
   }
225

226
   b->shader->did_writeout = true;
227
   return agx_st_tile(b, agx_src_index(&instr->src[0]),
228
             b->shader->key->fs.tib_formats[rt]);
229
}
230

231
static agx_instr *
232
agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)
233
{
234
   const nir_variable *var =
235
      nir_find_variable_with_driver_location(b->shader->nir,
236
            nir_var_shader_out, nir_intrinsic_base(instr));
237
   assert(var);
238

239
   unsigned loc = var->data.location;
240
   assert(var->data.index == 0 && "todo: dual-source blending");
241
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
242
   unsigned rt = (loc - FRAG_RESULT_DATA0);
243

244
   /* TODO: Reverse-engineer interactions with MRT */
245
   agx_writeout(b, 0xC200);
246
   agx_writeout(b, 0x0008);
247
   b->shader->did_writeout = true;
248
   b->shader->out->reads_tib = true;
249

250
   return agx_ld_tile_to(b, agx_dest_index(&instr->dest),
251
         b->shader->key->fs.tib_formats[rt]);
252
}
253

254
static enum agx_format
255
agx_format_for_bits(unsigned bits)
256
{
257
   switch (bits) {
258
   case 8: return AGX_FORMAT_I8;
259
   case 16: return AGX_FORMAT_I16;
260
   case 32: return AGX_FORMAT_I32;
261
   default: unreachable("Invalid bit size for load/store");
262
   }
263
}
264

265
static agx_instr *
266
agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)
267
{
268
   bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
269
   nir_src *offset = nir_get_io_offset_src(instr);
270

271
   if (!kernel_input && !nir_src_is_const(instr->src[0]))
272
      unreachable("todo: indirect UBO access");
273

274
   /* Constant offsets for device_load are 16-bit */
275
   bool offset_is_const = nir_src_is_const(*offset);
276
   assert(offset_is_const && "todo: indirect UBO access");
277
   int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;
278

279
   /* Offsets are shifted by the type size, so divide that out */
280
   unsigned bytes = nir_dest_bit_size(instr->dest) / 8;
281
   assert((const_offset & (bytes - 1)) == 0);
282
   const_offset = const_offset / bytes;
283
   int16_t const_as_16 = const_offset;
284

285
   /* UBO blocks are specified (kernel inputs are always 0) */
286
   uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);
287

288
   /* Each UBO has a 64-bit = 4 x 16-bit address */
289
   unsigned num_ubos = b->shader->nir->info.num_ubos;
290
   unsigned base_length = (num_ubos * 4);
291
   unsigned index = block * 4; /* 16 bit units */
292

293
   /* Lookup the base address (TODO: indirection) */
294
   agx_index base = agx_indexed_sysval(b->shader,
295
                                       AGX_PUSH_UBO_BASES, AGX_SIZE_64,
296
                                       index, base_length);
297

298
   /* Load the data */
299
   assert(instr->num_components <= 4);
300

301
   agx_device_load_to(b, agx_dest_index(&instr->dest),
302
                      base,
303
                      (offset_is_const && (const_offset == const_as_16)) ?
304
                      agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),
305
                      agx_format_for_bits(nir_dest_bit_size(instr->dest)),
306
                      BITFIELD_MASK(instr->num_components), 0);
307

308
   return agx_wait(b, 0);
309
}
310

311
static agx_instr *
312
agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)
313
{
314
   agx_index xy[2];
315

316
   for (unsigned i = 0; i < 2; ++i) {
317
      xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
318
               agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
319
               AGX_ROUND_RTE), agx_immediate_f(0.5f));
320
   }
321

322
   /* Ordering by the ABI */
323
   agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);
324
   agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);
325

326
   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
327
         xy[0], xy[1], z, w);
328
}
329

330
static agx_instr *
331
agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
332
{
333
     agx_index val = agx_indexed_sysval(b->shader,
334
           AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);
335

336
     return agx_mov_to(b, dst, val);
337
}
338

339
static agx_instr *
340
agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
341
{
342
  agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
343
     agx_dest_index(&instr->dest) : agx_null();
344
  gl_shader_stage stage = b->shader->stage;
345

346
  switch (instr->intrinsic) {
347
  case nir_intrinsic_load_barycentric_pixel:
348
  case nir_intrinsic_load_barycentric_centroid:
349
  case nir_intrinsic_load_barycentric_sample:
350
  case nir_intrinsic_load_barycentric_at_sample:
351
  case nir_intrinsic_load_barycentric_at_offset:
352
     /* handled later via load_vary */
353
     return NULL;
354
  case nir_intrinsic_load_interpolated_input:
355
     assert(stage == MESA_SHADER_FRAGMENT);
356
     return agx_emit_load_vary(b, instr);
357

358
  case nir_intrinsic_load_input:
359
     if (stage == MESA_SHADER_FRAGMENT)
360
        return agx_emit_load_vary_flat(b, instr);
361
     else if (stage == MESA_SHADER_VERTEX)
362
        return agx_emit_load_attr(b, instr);
363
     else
364
        unreachable("Unsupported shader stage");
365

366
  case nir_intrinsic_store_output:
367
     if (stage == MESA_SHADER_FRAGMENT)
368
        return agx_emit_fragment_out(b, instr);
369
     else if (stage == MESA_SHADER_VERTEX)
370
        return agx_emit_store_vary(b, instr);
371
     else
372
        unreachable("Unsupported shader stage");
373

374
  case nir_intrinsic_load_output:
375
     assert(stage == MESA_SHADER_FRAGMENT);
376
     return agx_emit_load_tile(b, instr);
377

378
  case nir_intrinsic_load_ubo:
379
  case nir_intrinsic_load_kernel_input:
380
     return agx_emit_load_ubo(b, instr);
381

382
  case nir_intrinsic_load_frag_coord:
383
     return agx_emit_load_frag_coord(b, instr);
384

385
  case nir_intrinsic_load_back_face_agx:
386
     return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
387

388
  case nir_intrinsic_load_vertex_id:
389
     return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32))); /* TODO: RA */
390

391
  case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
392
  case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
393
  case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
394
  case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);
395

396
  default:
397
       fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
398
       unreachable("Unhandled intrinsic");
399
  }
400
}
401

402
static agx_index
403
agx_alu_src_index(agx_builder *b, nir_alu_src src)
404
{
405
   /* Check well-formedness of the input NIR */
406
   ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
407
   unsigned comps = nir_src_num_components(src.src);
408
   unsigned channel = src.swizzle[0];
409

410
   assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
411
   assert(!(src.negate || src.abs));
412
   assert(channel < comps);
413

414
   agx_index idx = agx_src_index(&src.src);
415

416
   /* We only deal with scalars, emit p_extract if needed */
417
   if (comps > 1)
418
      return agx_p_extract(b, idx, channel);
419
   else
420
      return idx;
421
}
422

423
static agx_instr *
424
agx_emit_alu_bool(agx_builder *b, nir_op op,
425
      agx_index dst, agx_index s0, agx_index s1, agx_index s2)
426
{
427
   /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
428
    * This will give the optimizer flexibility. */
429
   agx_index f = agx_immediate(0);
430
   agx_index t = agx_immediate(0x1);
431

432
   switch (op) {
433
   case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
434
   case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
435
   case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
436
   case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
437

438
   case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
439
   case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
440
   case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
441
   case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
442
   case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
443
   case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
444

445
   case nir_op_mov: return agx_mov_to(b, dst, s0);
446
   case nir_op_iand: return agx_and_to(b, dst, s0, s1);
447
   case nir_op_ior: return agx_or_to(b, dst, s0, s1);
448
   case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
449
   case nir_op_inot: return agx_xor_to(b, dst, s0, t);
450

451
   case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
452
   case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
453
   case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
454

455
   case nir_op_bcsel:
456
      return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
457

458
   default:
459
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
460
      unreachable("Unhandled boolean ALU instruction");
461
   }
462
}
463

464
static agx_instr *
465
agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
466
{
467
   unsigned srcs = nir_op_infos[instr->op].num_inputs;
468
   unsigned sz = nir_dest_bit_size(instr->dest.dest);
469
   unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
470
   ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
471

472
   assert(comps == 1 || nir_op_is_vec(instr->op));
473
   assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
474

475
   agx_index dst = agx_dest_index(&instr->dest.dest);
476
   agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
477
   agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
478
   agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
479
   agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
480

481
   /* 1-bit bools are a bit special, only handle with select ops */
482
   if (sz == 1)
483
      return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
484

485
#define UNOP(nop, aop) \
486
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
487
#define BINOP(nop, aop) \
488
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
489
#define TRIOP(nop, aop) \
490
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
491

492
   switch (instr->op) {
493
   BINOP(fadd, fadd);
494
   BINOP(fmul, fmul);
495
   TRIOP(ffma, fma);
496

497
   UNOP(f2f16, fmov);
498
   UNOP(f2f32, fmov);
499
   UNOP(fround_even, roundeven);
500
   UNOP(ftrunc, trunc);
501
   UNOP(ffloor, floor);
502
   UNOP(fceil, ceil);
503
   UNOP(frcp, rcp);
504
   UNOP(frsq, rsqrt);
505
   UNOP(flog2, log2);
506
   UNOP(fexp2, exp2);
507

508
   UNOP(fddx, dfdx);
509
   UNOP(fddx_coarse, dfdx);
510
   UNOP(fddx_fine, dfdx);
511

512
   UNOP(fddy, dfdy);
513
   UNOP(fddy_coarse, dfdy);
514
   UNOP(fddy_fine, dfdy);
515

516
   UNOP(mov, mov);
517
   UNOP(u2u16, mov);
518
   UNOP(u2u32, mov);
519
   UNOP(inot, not);
520
   BINOP(iand, and);
521
   BINOP(ior, or);
522
   BINOP(ixor, xor);
523

524
   case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
525
   case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
526
   case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
527
   case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
528

529
   case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
530
   case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
531
   case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
532
   case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
533
   case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
534
   case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
535

536
   case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
537
   case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
538
   case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
539
   case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
540

541
   case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
542
   case nir_op_ushr: return agx_bfeil_to(b, dst, agx_zero(), s0, s1, 0);
543
   case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
544

545
   case nir_op_bcsel:
546
      return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
547

548
   case nir_op_b2i32:
549
   case nir_op_b2i16:
550
      return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
551

552
   case nir_op_b2f16:
553
   case nir_op_b2f32:
554
   {
555
      /* At this point, boolean is just zero/nonzero, so compare with zero */
556
      agx_index one = (sz == 16) ?
557
         agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
558
         agx_mov_imm(b, 32, fui(1.0));
559

560
      agx_index zero = agx_zero();
561

562
      return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
563
   }
564

565
   case nir_op_i2i32:
566
   {
567
      if (s0.size != AGX_SIZE_16)
568
         unreachable("todo: more conversions");
569

570
      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
571
   }
572

573
   case nir_op_i2i16:
574
   {
575
      if (s0.size != AGX_SIZE_32)
576
         unreachable("todo: more conversions");
577

578
      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
579
   }
580

581
   case nir_op_iadd_sat:
582
   {
583
      agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
584
      I->saturate = true;
585
      return I;
586
   }
587

588
   case nir_op_isub_sat:
589
   {
590
      agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
591
      I->saturate = true;
592
      return I;
593
   }
594

595
   case nir_op_uadd_sat:
596
   {
597
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
598
      I->saturate = true;
599
      return I;
600
   }
601

602
   case nir_op_usub_sat:
603
   {
604
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
605
      I->saturate = true;
606
      return I;
607
   }
608

609
   case nir_op_fsat:
610
   {
611
      agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
612
      I->saturate = true;
613
      return I;
614
   }
615

616
   case nir_op_fsin_agx:
617
   {
618
      agx_index fixup = agx_sin_pt_1(b, s0);
619
      agx_index sinc = agx_sin_pt_2(b, fixup);
620
      return agx_fmul_to(b, dst, sinc, fixup);
621
   }
622

623
   case nir_op_f2i16:
624
      return agx_convert_to(b, dst,
625
            agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
626

627
   case nir_op_f2i32:
628
      return agx_convert_to(b, dst,
629
            agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
630

631
   case nir_op_f2u16:
632
      return agx_convert_to(b, dst,
633
            agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
634

635
   case nir_op_f2u32:
636
      return agx_convert_to(b, dst,
637
            agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
638

639
   case nir_op_u2f16:
640
   case nir_op_u2f32:
641
   {
642
      if (src_sz == 64)
643
         unreachable("64-bit conversions unimplemented");
644

645
      enum agx_convert mode =
646
         (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
647
         (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
648
                          AGX_CONVERT_U8_TO_F;
649

650
      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
651
   }
652

653
   case nir_op_i2f16:
654
   case nir_op_i2f32:
655
   {
656
      if (src_sz == 64)
657
         unreachable("64-bit conversions unimplemented");
658

659
      enum agx_convert mode =
660
         (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
661
         (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
662
                          AGX_CONVERT_S8_TO_F;
663

664
      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
665
   }
666

667
   case nir_op_vec2:
668
   case nir_op_vec3:
669
   case nir_op_vec4:
670
      return agx_p_combine_to(b, dst, s0, s1, s2, s3);
671

672
   case nir_op_vec8:
673
   case nir_op_vec16:
674
      unreachable("should've been lowered");
675

676
   default:
677
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
678
      unreachable("Unhandled ALU instruction");
679
   }
680
}
681

682
static enum agx_dim
683
agx_tex_dim(enum glsl_sampler_dim dim, bool array)
684
{
685
   switch (dim) {
686
   case GLSL_SAMPLER_DIM_1D:
687
   case GLSL_SAMPLER_DIM_BUF:
688
      return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;
689

690
   case GLSL_SAMPLER_DIM_2D:
691
   case GLSL_SAMPLER_DIM_RECT:
692
   case GLSL_SAMPLER_DIM_EXTERNAL:
693
      return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;
694

695
   case GLSL_SAMPLER_DIM_MS:
696
      assert(!array && "multisampled arrays unsupported");
697
      return AGX_DIM_TEX_2D_MS;
698

699
   case GLSL_SAMPLER_DIM_3D:
700
      assert(!array && "3D arrays unsupported");
701
      return AGX_DIM_TEX_3D;
702

703
   case GLSL_SAMPLER_DIM_CUBE:
704
      return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;
705

706
   default:
707
      unreachable("Invalid sampler dim\n");
708
   }
709
}
710

711
static void
712
agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
713
{
714
   switch (instr->op) {
715
   case nir_texop_tex:
716
   case nir_texop_txl:
717
      break;
718
   default:
719
      unreachable("Unhandled texture op");
720
   }
721

722
   enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?
723
      AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
724

725
   agx_index coords = agx_null(),
726
             texture = agx_immediate(instr->texture_index),
727
             sampler = agx_immediate(instr->sampler_index),
728
             lod = agx_immediate(0),
729
             offset = agx_null();
730

731
   for (unsigned i = 0; i < instr->num_srcs; ++i) {
732
      agx_index index = agx_src_index(&instr->src[i].src);
733

734
      switch (instr->src[i].src_type) {
735
      case nir_tex_src_coord:
736
         coords = index;
737
         break;
738

739
      case nir_tex_src_lod:
740
         lod = index;
741
         break;
742

743
      case nir_tex_src_bias:
744
      case nir_tex_src_ms_index:
745
      case nir_tex_src_offset:
746
      case nir_tex_src_comparator:
747
      case nir_tex_src_texture_offset:
748
      case nir_tex_src_sampler_offset:
749
      default:
750
         unreachable("todo");
751
      }
752
   }
753

754
   agx_texture_sample_to(b, agx_dest_index(&instr->dest),
755
         coords, lod, texture, sampler, offset,
756
         agx_tex_dim(instr->sampler_dim, instr->is_array),
757
         lod_mode,
758
         0xF, /* TODO: wrmask */
759
         0);
760

761
   agx_wait(b, 0);
762
}
763

764
/* NIR loops are treated as a pair of AGX loops:
765
 *
766
 *    do {
767
 *       do {
768
 *          ...
769
 *       } while (0);
770
 *    } while (cond);
771
 *
772
 * By manipulating the nesting counter (r0l), we may break out of nested loops,
773
 * so under the model, both break and continue may be implemented as breaks,
774
 * where break breaks out of the outer loop (2 layers) and continue breaks out
775
 * of the inner loop (1 layer).
776
 *
777
 * After manipulating the nesting counter directly, pop_exec #0 must be used to
778
 * flush the update to the execution mask.
779
 */
780

781
static void
782
agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
783
{
784
   agx_context *ctx = b->shader;
785
   assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);
786

787
   /* Break out of either one or two loops */
788
   unsigned nestings = b->shader->loop_nesting;
789

790
   if (instr->type == nir_jump_continue) {
791
      nestings += 1;
792
      agx_block_add_successor(ctx->current_block, ctx->continue_block);
793
   } else if (instr->type == nir_jump_break) {
794
      nestings += 2;
795
      agx_block_add_successor(ctx->current_block, ctx->break_block);
796
   }
797

798
   /* Update the counter and flush */
799
   agx_index r0l = agx_register(0, false);
800
   agx_mov_to(b, r0l, agx_immediate(nestings));
801
   agx_pop_exec(b, 0);
802

803
   ctx->current_block->unconditional_jumps = true;
804
}
805

806
static void
807
agx_emit_instr(agx_builder *b, struct nir_instr *instr)
808
{
809
   switch (instr->type) {
810
   case nir_instr_type_load_const:
811
      agx_emit_load_const(b, nir_instr_as_load_const(instr));
812
      break;
813

814
   case nir_instr_type_intrinsic:
815
      agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
816
      break;
817

818
   case nir_instr_type_alu:
819
      agx_emit_alu(b, nir_instr_as_alu(instr));
820
      break;
821

822
   case nir_instr_type_tex:
823
      agx_emit_tex(b, nir_instr_as_tex(instr));
824
      break;
825

826
   case nir_instr_type_jump:
827
      agx_emit_jump(b, nir_instr_as_jump(instr));
828
      break;
829

830
   default:
831
      unreachable("should've been lowered");
832
   }
833
}
834

835
static agx_block *
836
agx_create_block(agx_context *ctx)
837
{
838
   agx_block *blk = rzalloc(ctx, agx_block);
839

840
   blk->predecessors = _mesa_set_create(blk,
841
         _mesa_hash_pointer, _mesa_key_pointer_equal);
842

843
   return blk;
844
}
845

846
static agx_block *
847
emit_block(agx_context *ctx, nir_block *block)
848
{
849
   if (ctx->after_block) {
850
      ctx->current_block = ctx->after_block;
851
      ctx->after_block = NULL;
852
   } else {
853
      ctx->current_block = agx_create_block(ctx);
854
   }
855

856
   agx_block *blk = ctx->current_block;
857
   list_addtail(&blk->link, &ctx->blocks);
858
   list_inithead(&blk->instructions);
859

860
   agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
861

862
   nir_foreach_instr(instr, block) {
863
      agx_emit_instr(&_b, instr);
864
   }
865

866
   return blk;
867
}
868

869
static agx_block *
870
emit_cf_list(agx_context *ctx, struct exec_list *list);
871

872
/* Emit if-else as
873
 *
874
 *    if_icmp cond != 0
875
 *       ...
876
 *    else_icmp cond == 0
877
 *       ...
878
 *    pop_exec
879
 *
880
 * If the else is empty, we can omit the else_icmp. This is not usually
881
 * optimal, but it's a start.
882
 */
883

884
static void
885
emit_if(agx_context *ctx, nir_if *nif)
886
{
887
   nir_block *nir_else_block = nir_if_first_else_block(nif);
888
   bool empty_else_block =
889
      (nir_else_block == nir_if_last_else_block(nif) &&
890
       exec_list_is_empty(&nir_else_block->instr_list));
891

892
   agx_block *first_block = ctx->current_block;
893
   agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
894
   agx_index cond = agx_src_index(&nif->condition);
895

896
   agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
897
   ctx->loop_nesting++;
898

899
   /* Emit the two subblocks. */
900
   agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
901
   agx_block *end_then = ctx->current_block;
902

903
   if (!empty_else_block) {
904
      _b.cursor = agx_after_block(ctx->current_block);
905
      agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
906
   }
907

908
   agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
909
   agx_block *end_else = ctx->current_block;
910

911
   ctx->after_block = agx_create_block(ctx);
912

913
   agx_block_add_successor(first_block, if_block);
914
   agx_block_add_successor(first_block, else_block);
915
   agx_block_add_successor(end_then, ctx->after_block);
916
   agx_block_add_successor(end_else, ctx->after_block);
917

918
   _b.cursor = agx_after_block(ctx->current_block);
919
   agx_pop_exec(&_b, 1);
920
   ctx->loop_nesting--;
921
}
922

923
static void
924
emit_loop(agx_context *ctx, nir_loop *nloop)
925
{
926
   /* We only track nesting within the innermost loop, so reset */
927
   ctx->loop_nesting = 0;
928

929
   agx_block *popped_break = ctx->break_block;
930
   agx_block *popped_continue = ctx->continue_block;
931

932
   ctx->break_block = agx_create_block(ctx);
933
   ctx->continue_block = agx_create_block(ctx);
934

935
   /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
936
   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
937
   agx_push_exec(&_b, 2);
938

939
   /* Fallthrough to body */
940
   agx_block_add_successor(ctx->current_block, ctx->continue_block);
941

942
   /* Emit the body */
943
   ctx->after_block = ctx->continue_block;
944
   agx_block *start_block = emit_cf_list(ctx, &nloop->body);
945

946
   /* Fix up the nesting counter via an always true while_icmp, and branch back
947
    * to start of loop if any lanes are active */
948
   _b.cursor = agx_after_block(ctx->current_block);
949
   agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
950
   agx_jmp_exec_any(&_b, start_block);
951
   agx_pop_exec(&_b, 2);
952
   agx_block_add_successor(ctx->current_block, ctx->continue_block);
953

954
   /* Pop off */
955
   ctx->after_block = ctx->break_block;
956
   ctx->break_block = popped_break;
957
   ctx->continue_block = popped_continue;
958

959
   /* Update shader-db stats */
960
   ++ctx->loop_count;
961

962
   /* All nested control flow must have finished */
963
   assert(ctx->loop_nesting == 0);
964
}
965

966
/* Before the first control flow structure, the nesting counter (r0l) needs to
967
 * be zeroed for correct operation. This only happens at most once, since by
968
 * definition this occurs at the end of the first block, which dominates the
969
 * rest of the program. */
970

971
static void
972
emit_first_cf(agx_context *ctx)
973
{
974
   if (ctx->any_cf)
975
      return;
976

977
   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
978
   agx_index r0l = agx_register(0, false);
979

980
   agx_mov_to(&_b, r0l, agx_immediate(0));
981
   ctx->any_cf = true;
982
}
983

984
static agx_block *
985
emit_cf_list(agx_context *ctx, struct exec_list *list)
986
{
987
   agx_block *start_block = NULL;
988

989
   foreach_list_typed(nir_cf_node, node, node, list) {
990
      switch (node->type) {
991
      case nir_cf_node_block: {
992
         agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
993

994
         if (!start_block)
995
            start_block = block;
996

997
         break;
998
      }
999

1000
      case nir_cf_node_if:
1001
         emit_first_cf(ctx);
1002
         emit_if(ctx, nir_cf_node_as_if(node));
1003
         break;
1004

1005
      case nir_cf_node_loop:
1006
         emit_first_cf(ctx);
1007
         emit_loop(ctx, nir_cf_node_as_loop(node));
1008
         break;
1009

1010
      default:
1011
         unreachable("Unknown control flow");
1012
      }
1013
   }
1014

1015
   return start_block;
1016
}
1017

1018
static void
1019
agx_set_st_vary_final(agx_context *ctx)
1020
{
1021
   agx_foreach_instr_global_rev(ctx, I) {
1022
      if (I->op == AGX_OPCODE_ST_VARY) {
1023
         I->last = true;
1024
         return;
1025
      }
1026
   }
1027
}
1028

1029
static void
1030
agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
1031
{
1032
   unsigned nr_ins = 0, nr_bytes = 0, nr_threads = 1;
1033

1034
   /* TODO */
1035
   fprintf(stderr, "%s shader: %u inst, %u bytes, %u threads, %u loops,"
1036
           "%u:%u spills:fills\n",
1037
           ctx->nir->info.label ?: "",
1038
           nr_ins, nr_bytes, nr_threads, ctx->loop_count,
1039
           ctx->spills, ctx->fills);
1040
}
1041

1042
static int
1043
glsl_type_size(const struct glsl_type *type, bool bindless)
1044
{
1045
   return glsl_count_attribute_slots(type, false);
1046
}
1047

1048
static bool
1049
agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
1050
{
1051
   if (instr->type != nir_instr_type_alu)
1052
      return false;
1053

1054
   nir_alu_instr *alu = nir_instr_as_alu(instr);
1055
   return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
1056
}
1057

1058
/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
1059
 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
1060
 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
1061
 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
1062
 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
1063
 * need to change units from radians to quadrants modulo turns. Cosine is
1064
 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
1065
 */
1066

1067
static nir_ssa_def *
1068
agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
1069
{
1070
   nir_alu_instr *alu = nir_instr_as_alu(instr);
1071
   nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
1072
   nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
1073

1074
   if (alu->op == nir_op_fcos)
1075
      turns = nir_fadd_imm(b, turns, 0.25f);
1076

1077
   nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
1078
   return nir_fsin_agx(b, quadrants);
1079
}
1080

1081
static bool
1082
agx_lower_sincos(nir_shader *shader)
1083
{
1084
   return nir_shader_lower_instructions(shader,
1085
         agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
1086
}
1087

1088
static bool
1089
agx_lower_front_face(struct nir_builder *b,
1090
                     nir_instr *instr, UNUSED void *data)
1091
{
1092
   if (instr->type != nir_instr_type_intrinsic)
1093
      return false;
1094

1095
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1096
   if (intr->intrinsic != nir_intrinsic_load_front_face)
1097
      return false;
1098

1099
   assert(intr->dest.is_ssa);
1100
   nir_ssa_def *def = &intr->dest.ssa;
1101
   assert(def->bit_size == 1);
1102

1103
   b->cursor = nir_before_instr(&intr->instr);
1104
   nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
1105
   return true;
1106
}
1107

1108
static bool
1109
agx_lower_point_coord(struct nir_builder *b,
1110
                      nir_instr *instr, UNUSED void *data)
1111
{
1112
   if (instr->type != nir_instr_type_intrinsic)
1113
      return false;
1114

1115
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1116

1117
   if (intr->intrinsic != nir_intrinsic_load_deref)
1118
      return false;
1119

1120
   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
1121
   nir_variable *var = nir_deref_instr_get_variable(deref);
1122

1123
   if (var->data.mode != nir_var_shader_in)
1124
      return false;
1125

1126
   if (var->data.location != VARYING_SLOT_PNTC)
1127
      return false;
1128

1129
   assert(intr->dest.is_ssa);
1130
   assert(intr->dest.ssa.num_components == 2);
1131

1132
   b->cursor = nir_after_instr(&intr->instr);
1133
   nir_ssa_def *def = nir_load_deref(b, deref);
1134
   nir_ssa_def *y = nir_channel(b, def, 1);
1135
   nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0);
1136
   nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y);
1137
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped);
1138
   return true;
1139
}
1140

1141
static void
1142
agx_optimize_nir(nir_shader *nir)
1143
{
1144
   bool progress;
1145

1146
   nir_lower_idiv_options idiv_options = {
1147
      .imprecise_32bit_lowering = true,
1148
      .allow_fp16 = true,
1149
   };
1150

1151
   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
1152
   NIR_PASS_V(nir, nir_lower_int64);
1153
   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
1154
   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1155
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1156
   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
1157
   NIR_PASS_V(nir, agx_lower_sincos);
1158
   NIR_PASS_V(nir, nir_shader_instructions_pass,
1159
         agx_lower_front_face,
1160
         nir_metadata_block_index | nir_metadata_dominance, NULL);
1161

1162
   do {
1163
      progress = false;
1164

1165
      NIR_PASS(progress, nir, nir_lower_var_copies);
1166
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
1167

1168
      NIR_PASS(progress, nir, nir_copy_prop);
1169
      NIR_PASS(progress, nir, nir_opt_remove_phis);
1170
      NIR_PASS(progress, nir, nir_opt_dce);
1171
      NIR_PASS(progress, nir, nir_opt_dead_cf);
1172
      NIR_PASS(progress, nir, nir_opt_cse);
1173
      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
1174
      NIR_PASS(progress, nir, nir_opt_algebraic);
1175
      NIR_PASS(progress, nir, nir_opt_constant_folding);
1176

1177
      NIR_PASS(progress, nir, nir_opt_undef);
1178
      NIR_PASS(progress, nir, nir_lower_undef_to_zero);
1179

1180
      NIR_PASS(progress, nir, nir_opt_loop_unroll,
1181
               nir_var_shader_in |
1182
               nir_var_shader_out |
1183
               nir_var_function_temp);
1184
   } while (progress);
1185

1186
   NIR_PASS_V(nir, nir_opt_algebraic_late);
1187
   NIR_PASS_V(nir, nir_opt_constant_folding);
1188
   NIR_PASS_V(nir, nir_copy_prop);
1189
   NIR_PASS_V(nir, nir_opt_dce);
1190
   NIR_PASS_V(nir, nir_opt_cse);
1191
   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1192
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1193

1194
   /* Cleanup optimizations */
1195
   nir_move_options move_all =
1196
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
1197
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
1198

1199
   NIR_PASS_V(nir, nir_opt_sink, move_all);
1200
   NIR_PASS_V(nir, nir_opt_move, move_all);
1201
   NIR_PASS_V(nir, nir_convert_from_ssa, true);
1202
}
1203

1204
/* ABI: position first, then user, then psiz */
1205
static void
1206
agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
1207
                      unsigned *remap)
1208
{
1209
   unsigned base = 0;
1210

1211
   nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
1212
   if (pos) {
1213
      assert(pos->data.driver_location < AGX_MAX_VARYINGS);
1214
      remap[pos->data.driver_location] = base;
1215
      base += 4;
1216
   }
1217

1218
   nir_foreach_shader_out_variable(var, nir) {
1219
      unsigned loc = var->data.location;
1220

1221
      if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
1222
         continue;
1223
      }
1224

1225
      assert(var->data.driver_location < AGX_MAX_VARYINGS);
1226
      remap[var->data.driver_location] = base;
1227
      base += 4;
1228
   }
1229

1230
   nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
1231
   if (psiz) {
1232
      assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
1233
      remap[psiz->data.driver_location] = base;
1234
      base += 1;
1235
   }
1236

1237
   varyings->nr_slots = base;
1238
}
1239

1240
static void
1241
agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
1242
                      unsigned *remap)
1243
{
1244
   struct agx_varying_packed *packed = varyings->packed;
1245
   unsigned base = 0;
1246

1247
   agx_pack(packed, VARYING, cfg) {
1248
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
1249
      cfg.components = 1;
1250
      cfg.triangle_slot = cfg.point_slot = base;
1251
   }
1252

1253
   base++;
1254
   packed++;
1255

1256
   agx_pack(packed, VARYING, cfg) {
1257
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
1258
      cfg.components = 1;
1259
      cfg.triangle_slot = cfg.point_slot = base;
1260
   }
1261

1262
   base++;
1263
   packed++;
1264

1265
   unsigned comps[MAX_VARYING] = { 0 };
1266

1267
   nir_foreach_shader_in_variable(var, nir) {
1268
     unsigned loc = var->data.driver_location;
1269
     const struct glsl_type *column =
1270
        glsl_without_array_or_matrix(var->type);
1271
     unsigned chan = glsl_get_components(column);
1272

1273
     /* If we have a fractional location added, we need to increase the size
1274
      * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
1275
      * We could do better but this is an edge case as it is, normally
1276
      * packed varyings will be aligned.
1277
      */
1278
     chan += var->data.location_frac;
1279
     comps[loc] = MAX2(comps[loc], chan);
1280
   }
1281

1282
   nir_foreach_shader_in_variable(var, nir) {
1283
     unsigned loc = var->data.driver_location;
1284
     unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
1285
     unsigned channels = comps[loc];
1286

1287
     assert(var->data.driver_location <= AGX_MAX_VARYINGS);
1288
     remap[var->data.driver_location] = base;
1289

1290
     for (int c = 0; c < sz; ++c) {
1291
        agx_pack(packed, VARYING, cfg) {
1292
           cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
1293
              AGX_VARYING_TYPE_POINT_COORDINATES :
1294
              (var->data.interpolation == INTERP_MODE_FLAT) ?
1295
                 AGX_VARYING_TYPE_FLAT_LAST :
1296
                 AGX_VARYING_TYPE_SMOOTH;
1297

1298
           cfg.components = channels;
1299
           cfg.triangle_slot = cfg.point_slot = base;
1300
        }
1301

1302
        base += channels;
1303
        packed++;
1304
     }
1305
   }
1306

1307
   varyings->nr_descs = (packed - varyings->packed);
1308
   varyings->nr_slots = base;
1309
}
1310

1311
void
1312
agx_compile_shader_nir(nir_shader *nir,
1313
      struct agx_shader_key *key,
1314
      struct util_dynarray *binary,
1315
      struct agx_shader_info *out)
1316
{
1317
   agx_debug = debug_get_option_agx_debug();
1318

1319
   agx_context *ctx = rzalloc(NULL, agx_context);
1320
   ctx->nir = nir;
1321
   ctx->out = out;
1322
   ctx->key = key;
1323
   ctx->stage = nir->info.stage;
1324
   list_inithead(&ctx->blocks);
1325

1326
   if (ctx->stage == MESA_SHADER_VERTEX) {
1327
      out->writes_psiz = nir->info.outputs_written &
1328
         BITFIELD_BIT(VARYING_SLOT_PSIZ);
1329
   }
1330

1331
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1332

1333
   /* Lower large arrays to scratch and small arrays to csel */
1334
   NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
1335
         glsl_get_natural_size_align_bytes);
1336
   NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
1337

1338
   if (ctx->stage == MESA_SHADER_VERTEX) {
1339
      /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
1340
      if (!key->vs.clip_halfz)
1341
         NIR_PASS_V(nir, nir_lower_clip_halfz);
1342
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1343
      /* Flip point coordinate since OpenGL and Metal disagree */
1344
      NIR_PASS_V(nir, nir_shader_instructions_pass,
1345
            agx_lower_point_coord,
1346
            nir_metadata_block_index | nir_metadata_dominance, NULL);
1347
   }
1348

1349
   NIR_PASS_V(nir, nir_split_var_copies);
1350
   NIR_PASS_V(nir, nir_lower_global_vars_to_local);
1351
   NIR_PASS_V(nir, nir_lower_var_copies);
1352
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1353
   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1354
         glsl_type_size, 0);
1355
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
1356
      NIR_PASS_V(nir, nir_lower_mediump_io,
1357
            nir_var_shader_in | nir_var_shader_out, ~0, false);
1358
   }
1359
   NIR_PASS_V(nir, nir_lower_ssbo);
1360

1361
   /* Varying output is scalar, other I/O is vector */
1362
   if (ctx->stage == MESA_SHADER_VERTEX) {
1363
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
1364
   }
1365

1366
   nir_lower_tex_options lower_tex_options = {
1367
      .lower_txs_lod = true,
1368
      .lower_txp = ~0,
1369
   };
1370

1371
   nir_tex_src_type_constraints tex_constraints = {
1372
      [nir_tex_src_lod] = { true, 16 }
1373
   };
1374

1375
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
1376
   NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
1377

1378
   agx_optimize_nir(nir);
1379

1380
   /* Must be last since NIR passes can remap driver_location freely */
1381
   if (ctx->stage == MESA_SHADER_VERTEX) {
1382
      agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
1383
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1384
      agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
1385
   }
1386

1387
   bool skip_internal = nir->info.internal;
1388
   skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);
1389

1390
   if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
1391
      nir_print_shader(nir, stdout);
1392
   }
1393

1394
   nir_foreach_function(func, nir) {
1395
      if (!func->impl)
1396
         continue;
1397

1398
      /* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing
1399
       * the mapping of nir_register to hardware registers and guaranteeing bad
1400
       * performance and breaking spilling... */
1401
      ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc);
1402

1403
      /* Leave the last 4 registers for hacky p-copy lowering */
1404
      unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2);
1405

1406
      /* Assign backwards so we don't need to guess a size */
1407
      nir_foreach_register(reg, &func->impl->registers) {
1408
         /* Ensure alignment */
1409
         if (reg->bit_size >= 32 && (nir_regalloc & 1))
1410
            nir_regalloc--;
1411

1412
         unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16);
1413
         nir_regalloc -= size;
1414
         ctx->nir_regalloc[reg->index] = nir_regalloc;
1415
      }
1416

1417
      ctx->max_register = nir_regalloc;
1418
      ctx->alloc += func->impl->ssa_alloc;
1419
      emit_cf_list(ctx, &func->impl->body);
1420
      break; /* TODO: Multi-function shaders */
1421
   }
1422

1423
   /* TODO: Actual RA... this way passes don't need to deal nir_register */
1424
   agx_foreach_instr_global(ctx, I) {
1425
      agx_foreach_dest(I, d) {
1426
         if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) {
1427
            I->dest[d].type = AGX_INDEX_REGISTER;
1428
            I->dest[d].value = ctx->nir_regalloc[I->dest[d].value];
1429
         }
1430
      }
1431

1432
      agx_foreach_src(I, s) {
1433
         if (I->src[s].type == AGX_INDEX_NIR_REGISTER) {
1434
            I->src[s].type = AGX_INDEX_REGISTER;
1435
            I->src[s].value = ctx->nir_regalloc[I->src[s].value];
1436
         }
1437
      }
1438
   }
1439

1440
   /* Terminate the shader after the exit block */
1441
   agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
1442
   agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
1443
   agx_stop(&_b);
1444

1445
   /* Also add traps to match the blob, unsure what the function is */
1446
   for (unsigned i = 0; i < 8; ++i)
1447
      agx_trap(&_b);
1448

1449
   unsigned block_source_count = 0;
1450

1451
   /* Name blocks now that we're done emitting so the order is consistent */
1452
   agx_foreach_block(ctx, block)
1453
      block->name = block_source_count++;
1454

1455
   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1456
      agx_print_shader(ctx, stdout);
1457

1458
   agx_optimizer(ctx);
1459
   agx_dce(ctx);
1460

1461
   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1462
      agx_print_shader(ctx, stdout);
1463

1464
   agx_ra(ctx);
1465

1466
   if (ctx->stage == MESA_SHADER_VERTEX)
1467
      agx_set_st_vary_final(ctx);
1468

1469
   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1470
      agx_print_shader(ctx, stdout);
1471

1472
   agx_pack_binary(ctx, binary);
1473

1474
   if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
1475
      agx_print_stats(ctx, binary->size, stderr);
1476

1477
   ralloc_free(ctx);
1478
}
1479

1480
Product

Resources

Company