GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/asahi/compiler/agx_compile.c
/*
 * Copyright (C) 2021 Alyssa Rosenzweig <[email protected]>
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "main/mtypes.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_builder.h"

static const struct debug_named_value agx_debug_options[] = {
   {"msgs",     AGX_DBG_MSGS,     "Print debug messages"},
   {"shaders",  AGX_DBG_SHADERS,  "Dump shaders in NIR and AIR"},
   {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},
   {"verbose",  AGX_DBG_VERBOSE,  "Disassemble verbosely"},
   {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},
   DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)

int agx_debug = 0;

#define DBG(fmt, ...) \
   do { if (agx_debug & AGX_DBG_MSGS) \
      fprintf(stderr, "%s:%d: "fmt, \
              __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
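
/* Usage sketch (hypothetical example, not from this file): setting
 * AGX_MESA_DEBUG=msgs,shaders in the environment enables those two flags, and
 * a call such as
 *
 *    DBG("compiling %s\n", nir->info.name);
 *
 * would then print prefixed with the enclosing function name and line number.
 */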

static void
agx_block_add_successor(agx_block *block, agx_block *successor)
{
   assert(block != NULL && successor != NULL);

   /* Cull impossible edges */
   if (block->unconditional_jumps)
      return;

   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
      if (block->successors[i]) {
         if (block->successors[i] == successor)
            return;
         else
            continue;
      }

      block->successors[i] = successor;
      _mesa_set_add(successor->predecessors, block);
      return;
   }

   unreachable("Too many successors");
}

static void
agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
{
   /* Ensure we've been scalarized and bit size lowered */
   unsigned bit_size = instr->def.bit_size;
   assert(instr->def.num_components == 1);
   assert(bit_size == 1 || bit_size == 16 || bit_size == 32);

   /* Emit move, later passes can inline/push if useful */
   agx_mov_imm_to(b,
                  agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
                  nir_const_value_as_uint(instr->value[0], bit_size));
}

/* AGX appears to lack support for vertex attributes. Lower to global loads. */
static agx_instr *
agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)
{
   nir_src *offset_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
   unsigned index = nir_intrinsic_base(instr) +
                    nir_src_as_uint(*offset_src);

   struct agx_shader_key *key = b->shader->key;
   struct agx_attribute attrib = key->vs.attributes[index];

   /* address = base + (stride * vertex_id) + src_offset */
   unsigned buf = attrib.buf;
   agx_index stride = agx_mov_imm(b, 32, key->vs.vbuf_strides[buf]);
   agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
   agx_index vertex_id = agx_register(10, AGX_SIZE_32); // TODO: RA
   agx_index offset = agx_imad(b, vertex_id, stride, src_offset, 0);
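
   /* Worked example with made-up numbers: for a vertex buffer stride of 16
    * bytes, attrib.src_offset = 8 and vertex_id = 3, the imad above yields
    * offset = 3 * 16 + 8 = 56 bytes, which device_load adds to the 64-bit
    * VBO base fetched below. */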

   /* Each VBO has a 64-bit address (4 x 16-bit units); look up the base
    * address as a sysval */
   unsigned num_vbos = key->vs.num_vbufs;
   unsigned base_length = (num_vbos * 4);
   agx_index base = agx_indexed_sysval(b->shader,
                                       AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);

   /* Load the data */
   assert(instr->num_components <= 4);

   bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);
   agx_index real_dest = agx_dest_index(&instr->dest);
   agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;

   agx_device_load_to(b, dest, base, offset, attrib.format,
                      BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);

   agx_wait(b, 0);

   if (pad) {
      agx_index one = agx_mov_imm(b, 32, fui(1.0));
      agx_index zero = agx_mov_imm(b, 32, 0);
      agx_index channels[4] = { zero, zero, zero, one };
      for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)
         channels[i] = agx_p_extract(b, dest, i);
      for (unsigned i = instr->num_components; i < 4; ++i)
         channels[i] = agx_null();
      agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);
   }

   return NULL;
}

static agx_instr *
agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)
{
   unsigned components = instr->num_components;
   assert(components >= 1 && components <= 4);

   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "no indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_src_as_uint(*offset);

   agx_index chan[4] = { agx_null() };

   for (unsigned i = 0; i < components; ++i) {
      /* vec3 for each vertex, unknown what first 2 channels are for */
      agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
      chan[i] = agx_p_extract(b, values, 2);
   }

   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
                           chan[0], chan[1], chan[2], chan[3]);
}

static agx_instr *
agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)
{
   ASSERTED unsigned components = instr->num_components;
   ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);

   assert(components >= 1 && components <= 4);
   assert(parent);

   /* TODO: Interpolation modes */
   assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);

   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "no indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_src_as_uint(*offset) * 4;

   return agx_ld_vary_to(b, agx_dest_index(&instr->dest),
                         agx_immediate(imm_index), components, true);
}

static agx_instr *
agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
{
   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "todo: indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_intrinsic_component(instr);
   imm_index += nir_src_as_uint(*offset);

   /* nir_lower_io_to_scalar */
   assert(nir_intrinsic_write_mask(instr) == 0x1);

   return agx_st_vary(b,
                      agx_immediate(imm_index),
                      agx_src_index(&instr->src[0]));
}

static agx_instr *
agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
{
   const nir_variable *var =
      nir_find_variable_with_driver_location(b->shader->nir,
                                             nir_var_shader_out, nir_intrinsic_base(instr));
   assert(var);

   unsigned loc = var->data.location;
   assert(var->data.index == 0 && "todo: dual-source blending");
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
   unsigned rt = (loc - FRAG_RESULT_DATA0);

   /* TODO: Reverse-engineer interactions with MRT */
   if (b->shader->nir->info.internal) {
      /* clear */
   } else if (b->shader->did_writeout) {
      agx_writeout(b, 0x0004);
   } else {
      agx_writeout(b, 0xC200);
      agx_writeout(b, 0x000C);
   }

   b->shader->did_writeout = true;
   return agx_st_tile(b, agx_src_index(&instr->src[0]),
                      b->shader->key->fs.tib_formats[rt]);
}

static agx_instr *
agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)
{
   const nir_variable *var =
      nir_find_variable_with_driver_location(b->shader->nir,
                                             nir_var_shader_out, nir_intrinsic_base(instr));
   assert(var);

   unsigned loc = var->data.location;
   assert(var->data.index == 0 && "todo: dual-source blending");
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
   unsigned rt = (loc - FRAG_RESULT_DATA0);

   /* TODO: Reverse-engineer interactions with MRT */
   agx_writeout(b, 0xC200);
   agx_writeout(b, 0x0008);
   b->shader->did_writeout = true;
   b->shader->out->reads_tib = true;

   return agx_ld_tile_to(b, agx_dest_index(&instr->dest),
                         b->shader->key->fs.tib_formats[rt]);
}

static enum agx_format
agx_format_for_bits(unsigned bits)
{
   switch (bits) {
   case 8: return AGX_FORMAT_I8;
   case 16: return AGX_FORMAT_I16;
   case 32: return AGX_FORMAT_I32;
   default: unreachable("Invalid bit size for load/store");
   }
}

static agx_instr *
agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)
{
   bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
   nir_src *offset = nir_get_io_offset_src(instr);

   if (!kernel_input && !nir_src_is_const(instr->src[0]))
      unreachable("todo: indirect UBO access");

   /* Constant offsets for device_load are 16-bit */
   bool offset_is_const = nir_src_is_const(*offset);
   assert(offset_is_const && "todo: indirect UBO access");
   int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;

   /* Offsets are shifted by the type size, so divide that out */
   unsigned bytes = nir_dest_bit_size(instr->dest) / 8;
   assert((const_offset & (bytes - 1)) == 0);
   const_offset = const_offset / bytes;
   int16_t const_as_16 = const_offset;
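
   /* Illustration (made-up values): a 32-bit UBO load at byte offset 64
    * arrives with const_offset = 64 and bytes = 4, so the offset passed to
    * device_load is 64 / 4 = 16 elements, small enough for the 16-bit
    * immediate path checked below. */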

   /* UBO blocks are specified (kernel inputs are always 0) */
   uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);

   /* Each UBO has a 64-bit = 4 x 16-bit address */
   unsigned num_ubos = b->shader->nir->info.num_ubos;
   unsigned base_length = (num_ubos * 4);
   unsigned index = block * 4; /* 16 bit units */

   /* Lookup the base address (TODO: indirection) */
   agx_index base = agx_indexed_sysval(b->shader,
                                       AGX_PUSH_UBO_BASES, AGX_SIZE_64,
                                       index, base_length);

   /* Load the data */
   assert(instr->num_components <= 4);

   agx_device_load_to(b, agx_dest_index(&instr->dest),
                      base,
                      (offset_is_const && (const_offset == const_as_16)) ?
                      agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),
                      agx_format_for_bits(nir_dest_bit_size(instr->dest)),
                      BITFIELD_MASK(instr->num_components), 0);

   return agx_wait(b, 0);
}

static agx_instr *
agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)
{
   agx_index xy[2];

   for (unsigned i = 0; i < 2; ++i) {
      xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
                                      agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
                                      AGX_ROUND_RTE), agx_immediate_f(0.5f));
   }

   /* Ordering by the ABI */
   agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);
   agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);
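
   /* Slots 0 and 1 here line up with agx_remap_varyings_fs below, which packs
    * the fragcoord W component at slot 0 and Z at slot 1 ahead of the user
    * varyings. */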

   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
                           xy[0], xy[1], z, w);
}

static agx_instr *
agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
{
   agx_index val = agx_indexed_sysval(b->shader,
                                      AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);

   return agx_mov_to(b, dst, val);
}

static agx_instr *
agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
{
   agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
                   agx_dest_index(&instr->dest) : agx_null();
   gl_shader_stage stage = b->shader->stage;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      /* handled later via load_vary */
      return NULL;
   case nir_intrinsic_load_interpolated_input:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_vary(b, instr);

   case nir_intrinsic_load_input:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_load_vary_flat(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_load_attr(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_store_output:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_fragment_out(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_store_vary(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_load_output:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_tile(b, instr);

   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_kernel_input:
      return agx_emit_load_ubo(b, instr);

   case nir_intrinsic_load_frag_coord:
      return agx_emit_load_frag_coord(b, instr);

   case nir_intrinsic_load_back_face_agx:
      return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);

   case nir_intrinsic_load_vertex_id:
      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32))); /* TODO: RA */

   case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
   case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
   case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
   case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);

   default:
      fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
      unreachable("Unhandled intrinsic");
   }
}

static agx_index
agx_alu_src_index(agx_builder *b, nir_alu_src src)
{
   /* Check well-formedness of the input NIR */
   ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
   unsigned comps = nir_src_num_components(src.src);
   unsigned channel = src.swizzle[0];

   assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
   assert(!(src.negate || src.abs));
   assert(channel < comps);

   agx_index idx = agx_src_index(&src.src);

   /* We only deal with scalars, emit p_extract if needed */
   if (comps > 1)
      return agx_p_extract(b, idx, channel);
   else
      return idx;
}

static agx_instr *
agx_emit_alu_bool(agx_builder *b, nir_op op,
                  agx_index dst, agx_index s0, agx_index s1, agx_index s2)
{
   /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
    * This will give the optimizer flexibility. */
   agx_index f = agx_immediate(0);
   agx_index t = agx_immediate(0x1);

   switch (op) {
   case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
   case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
   case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
   case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);

   case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
   case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
   case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
   case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
   case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
   case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);

   case nir_op_mov: return agx_mov_to(b, dst, s0);
   case nir_op_iand: return agx_and_to(b, dst, s0, s1);
   case nir_op_ior: return agx_or_to(b, dst, s0, s1);
   case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
   case nir_op_inot: return agx_xor_to(b, dst, s0, t);

   case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
   case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
   case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);

   case nir_op_bcsel:
      return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);

   default:
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
      unreachable("Unhandled boolean ALU instruction");
   }
}

static agx_instr *
agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
{
   unsigned srcs = nir_op_infos[instr->op].num_inputs;
   unsigned sz = nir_dest_bit_size(instr->dest.dest);
   unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
   ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);

   assert(comps == 1 || nir_op_is_vec(instr->op));
   assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);

   agx_index dst = agx_dest_index(&instr->dest.dest);
   agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
   agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
   agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
   agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();

   /* 1-bit bools are a bit special, only handle with select ops */
   if (sz == 1)
      return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);

#define UNOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
#define BINOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
#define TRIOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);

   switch (instr->op) {
   BINOP(fadd, fadd);
   BINOP(fmul, fmul);
   TRIOP(ffma, fma);

   UNOP(f2f16, fmov);
   UNOP(f2f32, fmov);
   UNOP(fround_even, roundeven);
   UNOP(ftrunc, trunc);
   UNOP(ffloor, floor);
   UNOP(fceil, ceil);
   UNOP(frcp, rcp);
   UNOP(frsq, rsqrt);
   UNOP(flog2, log2);
   UNOP(fexp2, exp2);

   UNOP(fddx, dfdx);
   UNOP(fddx_coarse, dfdx);
   UNOP(fddx_fine, dfdx);

   UNOP(fddy, dfdy);
   UNOP(fddy_coarse, dfdy);
   UNOP(fddy_fine, dfdy);

   UNOP(mov, mov);
   UNOP(u2u16, mov);
   UNOP(u2u32, mov);
   UNOP(inot, not);
   BINOP(iand, and);
   BINOP(ior, or);
   BINOP(ixor, xor);

   case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
   case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
   case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
   case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));

   case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
   case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
   case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
   case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
   case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
   case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);

   case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
   case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
   case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
   case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);

   case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
   case nir_op_ushr: return agx_bfeil_to(b, dst, agx_zero(), s0, s1, 0);
   case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);

   case nir_op_bcsel:
      return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);

   case nir_op_b2i32:
   case nir_op_b2i16:
      return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);

   case nir_op_b2f16:
   case nir_op_b2f32:
   {
      /* At this point, boolean is just zero/nonzero, so compare with zero */
      agx_index one = (sz == 16) ?
                      agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
                      agx_mov_imm(b, 32, fui(1.0));

      agx_index zero = agx_zero();

      return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
   }

   case nir_op_i2i32:
   {
      if (s0.size != AGX_SIZE_16)
         unreachable("todo: more conversions");

      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
   }

   case nir_op_i2i16:
   {
      if (s0.size != AGX_SIZE_32)
         unreachable("todo: more conversions");

      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
   }

   case nir_op_iadd_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
      I->saturate = true;
      return I;
   }

   case nir_op_isub_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
      I->saturate = true;
      return I;
   }

   case nir_op_uadd_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
      I->saturate = true;
      return I;
   }

   case nir_op_usub_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
      I->saturate = true;
      return I;
   }

   case nir_op_fsat:
   {
      agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
      I->saturate = true;
      return I;
   }

   case nir_op_fsin_agx:
   {
      agx_index fixup = agx_sin_pt_1(b, s0);
      agx_index sinc = agx_sin_pt_2(b, fixup);
      return agx_fmul_to(b, dst, sinc, fixup);
   }

   case nir_op_f2i16:
      return agx_convert_to(b, dst,
                            agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);

   case nir_op_f2i32:
      return agx_convert_to(b, dst,
                            agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);

   case nir_op_f2u16:
      return agx_convert_to(b, dst,
                            agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);

   case nir_op_f2u32:
      return agx_convert_to(b, dst,
                            agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);

   case nir_op_u2f16:
   case nir_op_u2f32:
   {
      if (src_sz == 64)
         unreachable("64-bit conversions unimplemented");

      enum agx_convert mode =
         (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
         (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
         AGX_CONVERT_U8_TO_F;

      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
   }

   case nir_op_i2f16:
   case nir_op_i2f32:
   {
      if (src_sz == 64)
         unreachable("64-bit conversions unimplemented");

      enum agx_convert mode =
         (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
         (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
         AGX_CONVERT_S8_TO_F;

      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
   }

   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      return agx_p_combine_to(b, dst, s0, s1, s2, s3);

   case nir_op_vec8:
   case nir_op_vec16:
      unreachable("should've been lowered");

   default:
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
      unreachable("Unhandled ALU instruction");
   }
}

static enum agx_dim
agx_tex_dim(enum glsl_sampler_dim dim, bool array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
   case GLSL_SAMPLER_DIM_BUF:
      return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;

   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;

   case GLSL_SAMPLER_DIM_MS:
      assert(!array && "multisampled arrays unsupported");
      return AGX_DIM_TEX_2D_MS;

   case GLSL_SAMPLER_DIM_3D:
      assert(!array && "3D arrays unsupported");
      return AGX_DIM_TEX_3D;

   case GLSL_SAMPLER_DIM_CUBE:
      return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;

   default:
      unreachable("Invalid sampler dim\n");
   }
}

static void
agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
{
   switch (instr->op) {
   case nir_texop_tex:
   case nir_texop_txl:
      break;
   default:
      unreachable("Unhandled texture op");
   }

   enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?
                                AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;

   agx_index coords = agx_null(),
             texture = agx_immediate(instr->texture_index),
             sampler = agx_immediate(instr->sampler_index),
             lod = agx_immediate(0),
             offset = agx_null();

   for (unsigned i = 0; i < instr->num_srcs; ++i) {
      agx_index index = agx_src_index(&instr->src[i].src);

      switch (instr->src[i].src_type) {
      case nir_tex_src_coord:
         coords = index;
         break;

      case nir_tex_src_lod:
         lod = index;
         break;

      case nir_tex_src_bias:
      case nir_tex_src_ms_index:
      case nir_tex_src_offset:
      case nir_tex_src_comparator:
      case nir_tex_src_texture_offset:
      case nir_tex_src_sampler_offset:
      default:
         unreachable("todo");
      }
   }

   agx_texture_sample_to(b, agx_dest_index(&instr->dest),
                         coords, lod, texture, sampler, offset,
                         agx_tex_dim(instr->sampler_dim, instr->is_array),
                         lod_mode,
                         0xF, /* TODO: wrmask */
                         0);

   agx_wait(b, 0);
}

/* NIR loops are treated as a pair of AGX loops:
 *
 *    do {
 *       do {
 *          ...
 *       } while (0);
 *    } while (cond);
 *
 * By manipulating the nesting counter (r0l), we may break out of nested loops,
 * so under the model, both break and continue may be implemented as breaks,
 * where break breaks out of the outer loop (2 layers) and continue breaks out
 * of the inner loop (1 layer).
 *
 * After manipulating the nesting counter directly, pop_exec #0 must be used to
 * flush the update to the execution mask.
 */
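
/* Concretely, matching agx_emit_jump below: a continue writes
 * (current nesting + 1) to r0l and a break writes (current nesting + 2),
 * after which pop_exec #0 re-evaluates the execution mask against the new
 * counter. */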

static void
agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
{
   agx_context *ctx = b->shader;
   assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);

   /* Break out of either one or two loops */
   unsigned nestings = b->shader->loop_nesting;

   if (instr->type == nir_jump_continue) {
      nestings += 1;
      agx_block_add_successor(ctx->current_block, ctx->continue_block);
   } else if (instr->type == nir_jump_break) {
      nestings += 2;
      agx_block_add_successor(ctx->current_block, ctx->break_block);
   }

   /* Update the counter and flush */
   agx_index r0l = agx_register(0, false);
   agx_mov_to(b, r0l, agx_immediate(nestings));
   agx_pop_exec(b, 0);

   ctx->current_block->unconditional_jumps = true;
}

static void
agx_emit_instr(agx_builder *b, struct nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      agx_emit_load_const(b, nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_intrinsic:
      agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_alu:
      agx_emit_alu(b, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_tex:
      agx_emit_tex(b, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      agx_emit_jump(b, nir_instr_as_jump(instr));
      break;

   default:
      unreachable("should've been lowered");
   }
}

static agx_block *
agx_create_block(agx_context *ctx)
{
   agx_block *blk = rzalloc(ctx, agx_block);

   blk->predecessors = _mesa_set_create(blk,
                                        _mesa_hash_pointer, _mesa_key_pointer_equal);

   return blk;
}

static agx_block *
emit_block(agx_context *ctx, nir_block *block)
{
   if (ctx->after_block) {
      ctx->current_block = ctx->after_block;
      ctx->after_block = NULL;
   } else {
      ctx->current_block = agx_create_block(ctx);
   }

   agx_block *blk = ctx->current_block;
   list_addtail(&blk->link, &ctx->blocks);
   list_inithead(&blk->instructions);

   agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));

   nir_foreach_instr(instr, block) {
      agx_emit_instr(&_b, instr);
   }

   return blk;
}

static agx_block *
emit_cf_list(agx_context *ctx, struct exec_list *list);

/* Emit if-else as
 *
 *    if_icmp cond != 0
 *       ...
 *    else_icmp cond == 0
 *       ...
 *    pop_exec
 *
 * If the else is empty, we can omit the else_icmp. This is not usually
 * optimal, but it's a start.
 */

static void
emit_if(agx_context *ctx, nir_if *nif)
{
   nir_block *nir_else_block = nir_if_first_else_block(nif);
   bool empty_else_block =
      (nir_else_block == nir_if_last_else_block(nif) &&
       exec_list_is_empty(&nir_else_block->instr_list));

   agx_block *first_block = ctx->current_block;
   agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
   agx_index cond = agx_src_index(&nif->condition);

   agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
   ctx->loop_nesting++;

   /* Emit the two subblocks. */
   agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
   agx_block *end_then = ctx->current_block;

   if (!empty_else_block) {
      _b.cursor = agx_after_block(ctx->current_block);
      agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
   }

   agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
   agx_block *end_else = ctx->current_block;

   ctx->after_block = agx_create_block(ctx);

   agx_block_add_successor(first_block, if_block);
   agx_block_add_successor(first_block, else_block);
   agx_block_add_successor(end_then, ctx->after_block);
   agx_block_add_successor(end_else, ctx->after_block);

   _b.cursor = agx_after_block(ctx->current_block);
   agx_pop_exec(&_b, 1);
   ctx->loop_nesting--;
}

static void
emit_loop(agx_context *ctx, nir_loop *nloop)
{
   /* We only track nesting within the innermost loop, so reset */
   ctx->loop_nesting = 0;

   agx_block *popped_break = ctx->break_block;
   agx_block *popped_continue = ctx->continue_block;

   ctx->break_block = agx_create_block(ctx);
   ctx->continue_block = agx_create_block(ctx);

   /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
   agx_push_exec(&_b, 2);

   /* Fallthrough to body */
   agx_block_add_successor(ctx->current_block, ctx->continue_block);

   /* Emit the body */
   ctx->after_block = ctx->continue_block;
   agx_block *start_block = emit_cf_list(ctx, &nloop->body);

   /* Fix up the nesting counter via an always true while_icmp, and branch back
    * to start of loop if any lanes are active */
   _b.cursor = agx_after_block(ctx->current_block);
   agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
   agx_jmp_exec_any(&_b, start_block);
   agx_pop_exec(&_b, 2);
   agx_block_add_successor(ctx->current_block, ctx->continue_block);

   /* Pop off */
   ctx->after_block = ctx->break_block;
   ctx->break_block = popped_break;
   ctx->continue_block = popped_continue;

   /* Update shader-db stats */
   ++ctx->loop_count;

   /* All nested control flow must have finished */
   assert(ctx->loop_nesting == 0);
}

/* Before the first control flow structure, the nesting counter (r0l) needs to
 * be zeroed for correct operation. This only happens at most once, since by
 * definition this occurs at the end of the first block, which dominates the
 * rest of the program. */

static void
emit_first_cf(agx_context *ctx)
{
   if (ctx->any_cf)
      return;

   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
   agx_index r0l = agx_register(0, false);

   agx_mov_to(&_b, r0l, agx_immediate(0));
   ctx->any_cf = true;
}

static agx_block *
emit_cf_list(agx_context *ctx, struct exec_list *list)
{
   agx_block *start_block = NULL;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));

         if (!start_block)
            start_block = block;

         break;
      }

      case nir_cf_node_if:
         emit_first_cf(ctx);
         emit_if(ctx, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         emit_first_cf(ctx);
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("Unknown control flow");
      }
   }

   return start_block;
}

static void
agx_set_st_vary_final(agx_context *ctx)
{
   agx_foreach_instr_global_rev(ctx, I) {
      if (I->op == AGX_OPCODE_ST_VARY) {
         I->last = true;
         return;
      }
   }
}

static void
agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
{
   unsigned nr_ins = 0, nr_bytes = 0, nr_threads = 1;

   /* TODO */
   fprintf(stderr, "%s shader: %u inst, %u bytes, %u threads, %u loops, "
           "%u:%u spills:fills\n",
           ctx->nir->info.label ?: "",
           nr_ins, nr_bytes, nr_threads, ctx->loop_count,
           ctx->spills, ctx->fills);
}

static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

static bool
agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
}

/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
 * need to change units from radians to quadrants modulo turns. Cosine is
 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
 */
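
/* Equivalently, with tau = 2 * pi: quadrants = 4 * fract(x / tau), and cosine
 * first adds a quarter turn. This is exactly what the constants below encode:
 * M_1_PI * 0.5 is 1 / tau, the fadd_imm of 0.25 is the quadrant shift, and the
 * ffract/fmul by 4.0 produce the quadrant angle fed to fsin_agx. */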

static nir_ssa_def *
agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
{
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
   nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);

   if (alu->op == nir_op_fcos)
      turns = nir_fadd_imm(b, turns, 0.25f);

   nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
   return nir_fsin_agx(b, quadrants);
}

static bool
agx_lower_sincos(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
}

static bool
agx_lower_front_face(struct nir_builder *b,
                     nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_front_face)
      return false;

   assert(intr->dest.is_ssa);
   nir_ssa_def *def = &intr->dest.ssa;
   assert(def->bit_size == 1);

   b->cursor = nir_before_instr(&intr->instr);
   nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
   return true;
}

static bool
agx_lower_point_coord(struct nir_builder *b,
                      nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   if (intr->intrinsic != nir_intrinsic_load_deref)
      return false;

   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   if (var->data.mode != nir_var_shader_in)
      return false;

   if (var->data.location != VARYING_SLOT_PNTC)
      return false;

   assert(intr->dest.is_ssa);
   assert(intr->dest.ssa.num_components == 2);

   b->cursor = nir_after_instr(&intr->instr);
   nir_ssa_def *def = nir_load_deref(b, deref);
   nir_ssa_def *y = nir_channel(b, def, 1);
   nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0);
   nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped);
   return true;
}

static void
agx_optimize_nir(nir_shader *nir)
{
   bool progress;

   nir_lower_idiv_options idiv_options = {
      .imprecise_32bit_lowering = true,
      .allow_fp16 = true,
   };

   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
   NIR_PASS_V(nir, nir_lower_int64);
   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
   NIR_PASS_V(nir, agx_lower_sincos);
   NIR_PASS_V(nir, nir_shader_instructions_pass,
              agx_lower_front_face,
              nir_metadata_block_index | nir_metadata_dominance, NULL);

   do {
      progress = false;

      NIR_PASS(progress, nir, nir_lower_var_copies);
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_lower_undef_to_zero);

      NIR_PASS(progress, nir, nir_opt_loop_unroll,
               nir_var_shader_in |
               nir_var_shader_out |
               nir_var_function_temp);
   } while (progress);

   NIR_PASS_V(nir, nir_opt_algebraic_late);
   NIR_PASS_V(nir, nir_opt_constant_folding);
   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_dce);
   NIR_PASS_V(nir, nir_opt_cse);
   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* Cleanup optimizations */
   nir_move_options move_all =
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

   NIR_PASS_V(nir, nir_opt_sink, move_all);
   NIR_PASS_V(nir, nir_opt_move, move_all);
   NIR_PASS_V(nir, nir_convert_from_ssa, true);
}

/* ABI: position first, then user, then psiz */
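/* For instance (hypothetical shader): a vertex shader writing gl_Position,
 * one user vec4 varying, and gl_PointSize would have them remapped to scalar
 * slots 0-3, 4-7 and 8 respectively, giving nr_slots = 9. */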
static void
agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
                      unsigned *remap)
{
   unsigned base = 0;

   nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
   if (pos) {
      assert(pos->data.driver_location < AGX_MAX_VARYINGS);
      remap[pos->data.driver_location] = base;
      base += 4;
   }

   nir_foreach_shader_out_variable(var, nir) {
      unsigned loc = var->data.location;

      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
         continue;
      }

      assert(var->data.driver_location < AGX_MAX_VARYINGS);
      remap[var->data.driver_location] = base;
      base += 4;
   }

   nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
   if (psiz) {
      assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
      remap[psiz->data.driver_location] = base;
      base += 1;
   }

   varyings->nr_slots = base;
}

static void
agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
                      unsigned *remap)
{
   struct agx_varying_packed *packed = varyings->packed;
   unsigned base = 0;

   agx_pack(packed, VARYING, cfg) {
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
      cfg.components = 1;
      cfg.triangle_slot = cfg.point_slot = base;
   }

   base++;
   packed++;

   agx_pack(packed, VARYING, cfg) {
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
      cfg.components = 1;
      cfg.triangle_slot = cfg.point_slot = base;
   }

   base++;
   packed++;

   unsigned comps[MAX_VARYING] = { 0 };

   nir_foreach_shader_in_variable(var, nir) {
      unsigned loc = var->data.driver_location;
      const struct glsl_type *column =
         glsl_without_array_or_matrix(var->type);
      unsigned chan = glsl_get_components(column);

      /* If we have a fractional location added, we need to increase the size
       * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
       * We could do better but this is an edge case as it is, normally
       * packed varyings will be aligned.
       */
      chan += var->data.location_frac;
      comps[loc] = MAX2(comps[loc], chan);
   }

   nir_foreach_shader_in_variable(var, nir) {
      unsigned loc = var->data.driver_location;
      unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
      unsigned channels = comps[loc];

      assert(var->data.driver_location <= AGX_MAX_VARYINGS);
      remap[var->data.driver_location] = base;

      for (int c = 0; c < sz; ++c) {
         agx_pack(packed, VARYING, cfg) {
            cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
                       AGX_VARYING_TYPE_POINT_COORDINATES :
                       (var->data.interpolation == INTERP_MODE_FLAT) ?
                       AGX_VARYING_TYPE_FLAT_LAST :
                       AGX_VARYING_TYPE_SMOOTH;

            cfg.components = channels;
            cfg.triangle_slot = cfg.point_slot = base;
         }

         base += channels;
         packed++;
      }
   }

   varyings->nr_descs = (packed - varyings->packed);
   varyings->nr_slots = base;
}

void
agx_compile_shader_nir(nir_shader *nir,
                       struct agx_shader_key *key,
                       struct util_dynarray *binary,
                       struct agx_shader_info *out)
{
   agx_debug = debug_get_option_agx_debug();

   agx_context *ctx = rzalloc(NULL, agx_context);
   ctx->nir = nir;
   ctx->out = out;
   ctx->key = key;
   ctx->stage = nir->info.stage;
   list_inithead(&ctx->blocks);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      out->writes_psiz = nir->info.outputs_written &
                         BITFIELD_BIT(VARYING_SLOT_PSIZ);
   }

   NIR_PASS_V(nir, nir_lower_vars_to_ssa);

   /* Lower large arrays to scratch and small arrays to csel */
   NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
              glsl_get_natural_size_align_bytes);
   NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
      if (!key->vs.clip_halfz)
         NIR_PASS_V(nir, nir_lower_clip_halfz);
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
      /* Flip point coordinate since OpenGL and Metal disagree */
      NIR_PASS_V(nir, nir_shader_instructions_pass,
                 agx_lower_point_coord,
                 nir_metadata_block_index | nir_metadata_dominance, NULL);
   }

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_lower_global_vars_to_local);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              glsl_type_size, 0);
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 nir_var_shader_in | nir_var_shader_out, ~0, false);
   }
   NIR_PASS_V(nir, nir_lower_ssbo);

   /* Varying output is scalar, other I/O is vector */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
   }

   nir_lower_tex_options lower_tex_options = {
      .lower_txs_lod = true,
      .lower_txp = ~0,
   };

   nir_tex_src_type_constraints tex_constraints = {
      [nir_tex_src_lod] = { true, 16 }
   };

   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
   NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);

   agx_optimize_nir(nir);

   /* Must be last since NIR passes can remap driver_location freely */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
      agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
   }

   bool skip_internal = nir->info.internal;
   skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
      nir_print_shader(nir, stdout);
   }

   nir_foreach_function(func, nir) {
      if (!func->impl)
         continue;

      /* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing
       * the mapping of nir_register to hardware registers and guaranteeing bad
       * performance and breaking spilling... */
      ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc);

      /* Leave the last 4 registers for hacky p-copy lowering */
      unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2);

      /* Assign backwards so we don't need to guess a size */
      nir_foreach_register(reg, &func->impl->registers) {
         /* Ensure alignment */
         if (reg->bit_size >= 32 && (nir_regalloc & 1))
            nir_regalloc--;

         unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16);
         nir_regalloc -= size;
         ctx->nir_regalloc[reg->index] = nir_regalloc;
      }
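
      /* Illustration of the backwards assignment (hypothetical register file
       * state): sizes are in 16-bit units, so a 32-bit scalar nir_register
       * takes DIV_ROUND_UP(32 * 1, 16) = 2 units and moves nir_regalloc down
       * by 2, after the evenness fixup above. */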

      ctx->max_register = nir_regalloc;
      ctx->alloc += func->impl->ssa_alloc;
      emit_cf_list(ctx, &func->impl->body);
      break; /* TODO: Multi-function shaders */
   }

   /* TODO: Actual RA... this way passes don't need to deal with nir_register */
   agx_foreach_instr_global(ctx, I) {
      agx_foreach_dest(I, d) {
         if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) {
            I->dest[d].type = AGX_INDEX_REGISTER;
            I->dest[d].value = ctx->nir_regalloc[I->dest[d].value];
         }
      }

      agx_foreach_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NIR_REGISTER) {
            I->src[s].type = AGX_INDEX_REGISTER;
            I->src[s].value = ctx->nir_regalloc[I->src[s].value];
         }
      }
   }

   /* Terminate the shader after the exit block */
   agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
   agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
   agx_stop(&_b);

   /* Also add traps to match the blob, unsure what their purpose is */
   for (unsigned i = 0; i < 8; ++i)
      agx_trap(&_b);

   unsigned block_source_count = 0;

   /* Name blocks now that we're done emitting so the order is consistent */
   agx_foreach_block(ctx, block)
      block->name = block_source_count++;

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_optimizer(ctx);
   agx_dce(ctx);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_ra(ctx);

   if (ctx->stage == MESA_SHADER_VERTEX)
      agx_set_st_vary_final(ctx);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_pack_binary(ctx, binary);

   if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
      agx_print_stats(ctx, binary->size, stderr);

   ralloc_free(ctx);
}