GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/bifrost/bifrost_compile.c
1
/*
2
* Copyright (C) 2020 Collabora Ltd.
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
* SOFTWARE.
22
*
23
* Authors (Collabora):
24
* Alyssa Rosenzweig <[email protected]>
25
*/
26
27
#include "main/mtypes.h"
28
#include "compiler/glsl/glsl_to_nir.h"
29
#include "compiler/nir_types.h"
30
#include "compiler/nir/nir_builder.h"
31
#include "util/u_debug.h"
32
33
#include "disassemble.h"
34
#include "bifrost_compile.h"
35
#include "compiler.h"
36
#include "bi_quirks.h"
37
#include "bi_builder.h"
38
#include "bifrost_nir.h"
39
40
static const struct debug_named_value bifrost_debug_options[] = {
41
{"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
42
{"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
43
{"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
44
{"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
45
{"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
46
{"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
47
{"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
48
DEBUG_NAMED_VALUE_END
49
};
50
51
DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
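
/* A note on usage (illustrative, not part of the original file): the flags
 * above are parsed from the comma-separated BIFROST_MESA_DEBUG environment
 * variable by the util/u_debug helpers, so something like
 *
 *   BIFROST_MESA_DEBUG=shaders,shaderdb <application>
 *
 * would dump each shader's NIR/MIR and print the compile statistics. */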
52
53
/* How many bytes are prefetched by the Bifrost shader core. From the final
54
* clause of the shader, this range must be valid instructions or zero. */
55
#define BIFROST_SHADER_PREFETCH 128
56
57
int bifrost_debug = 0;
58
59
#define DBG(fmt, ...) \
60
do { if (bifrost_debug & BIFROST_DBG_MSGS) \
61
fprintf(stderr, "%s:%d: "fmt, \
62
__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
63
64
static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
65
66
static void
67
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
68
{
69
bi_instr *branch = bi_jump(b, bi_zero());
70
71
switch (instr->type) {
72
case nir_jump_break:
73
branch->branch_target = b->shader->break_block;
74
break;
75
case nir_jump_continue:
76
branch->branch_target = b->shader->continue_block;
77
break;
78
default:
79
unreachable("Unhandled jump type");
80
}
81
82
pan_block_add_successor(&b->shader->current_block->base, &branch->branch_target->base);
83
b->shader->current_block->base.unconditional_jumps = true;
84
}
85
86
static bi_index
87
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
88
{
89
switch (intr->intrinsic) {
90
case nir_intrinsic_load_barycentric_centroid:
91
case nir_intrinsic_load_barycentric_sample:
92
return bi_register(61);
93
94
/* Need to put the sample ID in the top 16 bits */
95
case nir_intrinsic_load_barycentric_at_sample:
96
return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
97
bi_half(bi_src_index(&intr->src[0]), false));
98
99
/* Interpret as 8:8 signed fixed point positions in pixels along X and
100
* Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
101
* is the center of the pixel, so we first fix up and then convert. For
102
* fp16 input:
103
*
104
* f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
105
* f2i16((256 * (x, y)) + (128, 128)) =
106
* V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
107
*
108
* For fp32 input, that lacks enough precision for MSAA 16x, but the
109
* idea is the same. FIXME: still doesn't pass
110
*/
111
case nir_intrinsic_load_barycentric_at_offset: {
112
bi_index offset = bi_src_index(&intr->src[0]);
113
bi_index f16 = bi_null();
114
unsigned sz = nir_src_bit_size(intr->src[0]);
115
116
if (sz == 16) {
117
f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
118
bi_imm_f16(128.0), BI_ROUND_NONE);
119
} else {
120
assert(sz == 32);
121
bi_index f[2];
122
for (unsigned i = 0; i < 2; ++i) {
123
f[i] = bi_fadd_rscale_f32(b,
124
bi_word(offset, i),
125
bi_imm_f32(0.5), bi_imm_u32(8),
126
BI_ROUND_NONE, BI_SPECIAL_NONE);
127
}
128
129
f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
130
}
131
132
return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
133
}
134
135
case nir_intrinsic_load_barycentric_pixel:
136
default:
137
return bi_dontcare();
138
}
139
}
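
/* Host-side sketch of the at_offset fixed-point conversion described above,
 * assuming a single float offset component in pixel units relative to the
 * pixel centre (illustrative only, not used by the compiler):
 *
 *   #include <stdint.h>
 *
 *   static inline int16_t
 *   offset_to_s8_8(float x)
 *   {
 *           // Re-centre on the top-left corner, then scale to 8:8 fixed point
 *           return (int16_t)((x + 0.5f) * 256.0f);
 *   }
 */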
140
141
static enum bi_sample
142
bi_interp_for_intrinsic(nir_intrinsic_op op)
143
{
144
switch (op) {
145
case nir_intrinsic_load_barycentric_centroid:
146
return BI_SAMPLE_CENTROID;
147
case nir_intrinsic_load_barycentric_sample:
148
case nir_intrinsic_load_barycentric_at_sample:
149
return BI_SAMPLE_SAMPLE;
150
case nir_intrinsic_load_barycentric_at_offset:
151
return BI_SAMPLE_EXPLICIT;
152
case nir_intrinsic_load_barycentric_pixel:
153
default:
154
return BI_SAMPLE_CENTER;
155
}
156
}
157
158
/* auto, 64-bit omitted */
159
static enum bi_register_format
160
bi_reg_fmt_for_nir(nir_alu_type T)
161
{
162
switch (T) {
163
case nir_type_float16: return BI_REGISTER_FORMAT_F16;
164
case nir_type_float32: return BI_REGISTER_FORMAT_F32;
165
case nir_type_int16: return BI_REGISTER_FORMAT_S16;
166
case nir_type_uint16: return BI_REGISTER_FORMAT_U16;
167
case nir_type_int32: return BI_REGISTER_FORMAT_S32;
168
case nir_type_uint32: return BI_REGISTER_FORMAT_U32;
169
default: unreachable("Invalid type for register format");
170
}
171
}
172
173
/* Checks if the _IMM variant of an intrinsic can be used, returning in *immediate the
174
* immediate to be used (which applies even if _IMM can't be used) */
175
176
static bool
177
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
178
{
179
nir_src *offset = nir_get_io_offset_src(instr);
180
181
if (!nir_src_is_const(*offset))
182
return false;
183
184
*immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
185
return (*immediate) < max;
186
}
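
/* For example, with nir_intrinsic_base(instr) == 4 and a constant source
 * offset of 3, *immediate becomes 7 and the _IMM variant may be used as long
 * as 7 < max; with a non-constant offset the function returns false and the
 * caller falls back to the indirect form. */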
187
188
static void
189
bi_make_vec_to(bi_builder *b, bi_index final_dst,
190
bi_index *src,
191
unsigned *channel,
192
unsigned count,
193
unsigned bitsize);
194
195
/* Bifrost's load instructions lack a component offset despite operating in
196
* terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
197
* but they may be unavoidable with separate shaders in use. To solve this, we
198
* lower to a larger load and an explicit copy of the desired components. */
199
200
static void
201
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
202
{
203
unsigned component = nir_intrinsic_component(instr);
204
205
if (component == 0)
206
return;
207
208
bi_index srcs[] = { tmp, tmp, tmp, tmp };
209
unsigned channels[] = { component, component + 1, component + 2 };
210
211
bi_make_vec_to(b,
212
bi_dest_index(&instr->dest),
213
srcs, channels, instr->num_components,
214
nir_dest_bit_size(instr->dest));
215
}
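
/* For instance, a 32-bit load with component == 2 and two components is
 * lowered to a wider load into a temporary followed by copies of channels
 * {2, 3} into the real destination -- roughly (a sketch of the effect, not
 * literal output):
 *
 *   tmp    = LD_...              ; always starts at channel 0
 *   dst.w0 = MOV.i32 tmp.w2
 *   dst.w1 = MOV.i32 tmp.w3
 */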
216
217
static void
218
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
219
{
220
nir_alu_type T = nir_intrinsic_dest_type(instr);
221
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
222
nir_src *offset = nir_get_io_offset_src(instr);
223
unsigned component = nir_intrinsic_component(instr);
224
enum bi_vecsize vecsize = (instr->num_components + component - 1);
225
unsigned imm_index = 0;
226
unsigned base = nir_intrinsic_base(instr);
227
bool constant = nir_src_is_const(*offset);
228
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
229
bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
230
231
if (immediate) {
232
bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
233
regfmt, vecsize, imm_index);
234
} else {
235
bi_index idx = bi_src_index(&instr->src[0]);
236
237
if (constant)
238
idx = bi_imm_u32(imm_index);
239
else if (base != 0)
240
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
241
242
bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
243
idx, regfmt, vecsize);
244
}
245
246
bi_copy_component(b, instr, dest);
247
}
248
249
static void
250
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
251
{
252
enum bi_sample sample = BI_SAMPLE_CENTER;
253
enum bi_update update = BI_UPDATE_STORE;
254
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
255
bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
256
bi_index src0 = bi_null();
257
258
unsigned component = nir_intrinsic_component(instr);
259
enum bi_vecsize vecsize = (instr->num_components + component - 1);
260
bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
261
262
unsigned sz = nir_dest_bit_size(instr->dest);
263
264
if (smooth) {
265
nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
266
assert(parent);
267
268
sample = bi_interp_for_intrinsic(parent->intrinsic);
269
src0 = bi_varying_src0_for_barycentric(b, parent);
270
271
assert(sz == 16 || sz == 32);
272
regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
273
: BI_REGISTER_FORMAT_F32;
274
} else {
275
assert(sz == 32);
276
regfmt = BI_REGISTER_FORMAT_U32;
277
}
278
279
nir_src *offset = nir_get_io_offset_src(instr);
280
unsigned imm_index = 0;
281
bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
282
283
if (immediate && smooth) {
284
bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
285
vecsize, imm_index);
286
} else if (immediate && !smooth) {
287
bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
288
vecsize, imm_index);
289
} else {
290
bi_index idx = bi_src_index(offset);
291
unsigned base = nir_intrinsic_base(instr);
292
293
if (base != 0)
294
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
295
296
if (smooth) {
297
bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
298
update, vecsize);
299
} else {
300
bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
301
regfmt, vecsize);
302
}
303
}
304
305
bi_copy_component(b, instr, dest);
306
}
307
308
static void
309
bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
310
unsigned *channel, unsigned count)
311
{
312
for (unsigned i = 0; i < count; i += 2) {
313
bool next = (i + 1) < count;
314
315
unsigned chan = channel ? channel[i] : 0;
316
unsigned nextc = next && channel ? channel[i + 1] : 0;
317
318
bi_index w0 = bi_word(src[i], chan >> 1);
319
bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();
320
321
bi_index h0 = bi_half(w0, chan & 1);
322
bi_index h1 = bi_half(w1, nextc & 1);
323
324
bi_index to = bi_word(dst, i >> 1);
325
326
if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
327
bi_mov_i32_to(b, to, w0);
328
else if (bi_is_word_equiv(w0, w1))
329
bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
330
else
331
bi_mkvec_v2i16_to(b, to, h0, h1);
332
}
333
}
334
335
static void
336
bi_make_vec_to(bi_builder *b, bi_index final_dst,
337
bi_index *src,
338
unsigned *channel,
339
unsigned count,
340
unsigned bitsize)
341
{
342
/* If we read our own output, we need a temporary move to allow for
343
* swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
344
* vectors */
345
bool reads_self = false;
346
347
for (unsigned i = 0; i < count; ++i)
348
reads_self |= bi_is_equiv(final_dst, src[i]);
349
350
/* SSA can't read itself */
351
assert(!reads_self || final_dst.reg);
352
353
bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;
354
355
if (bitsize == 32) {
356
for (unsigned i = 0; i < count; ++i) {
357
bi_mov_i32_to(b, bi_word(dst, i),
358
bi_word(src[i], channel ? channel[i] : 0));
359
}
360
} else if (bitsize == 16) {
361
bi_make_vec16_to(b, dst, src, channel, count);
362
} else if (bitsize == 8 && count == 1) {
363
bi_swz_v4i8_to(b, dst, bi_byte(
364
bi_word(src[0], channel[0] >> 2),
365
channel[0] & 3));
366
} else {
367
unreachable("8-bit mkvec not yet supported");
368
}
369
370
/* Emit an explicit copy if needed */
371
if (!bi_is_equiv(dst, final_dst)) {
372
unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0;
373
unsigned vec = (1 << shift);
374
375
for (unsigned i = 0; i < count; i += vec) {
376
bi_mov_i32_to(b, bi_word(final_dst, i >> shift),
377
bi_word(dst, i >> shift));
378
}
379
}
380
}
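
/* Shape of the result for a 16-bit vec4 built from four scalar halves (an
 * illustrative sketch; the exact opcode per pair -- MOV, SWZ or MKVEC --
 * depends on whether both halves already live in the same 32-bit word):
 *
 *   dst.w0 = MKVEC.v2i16 src0.hX, src1.hY
 *   dst.w1 = MKVEC.v2i16 src2.hX, src3.hY
 */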
381
382
static bi_instr *
383
bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
384
unsigned nr_components, unsigned offset)
385
{
386
unsigned sysval_ubo =
387
MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
388
unsigned uniform =
389
pan_lookup_sysval(b->shader->sysval_to_id,
390
&b->shader->info->sysvals,
391
sysval);
392
unsigned idx = (uniform * 16) + offset;
393
394
return bi_load_to(b, nr_components * 32, dest,
395
bi_imm_u32(idx),
396
bi_imm_u32(sysval_ubo), BI_SEG_UBO);
397
}
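
/* The sysval table is just a driver-managed UBO of vec4 slots, so a sysval
 * at table index `uniform` is fetched from byte address uniform * 16 plus
 * the caller-supplied offset; e.g. the .y component of a 32-bit vec4 sysval
 * in slot 3 lives at 3 * 16 + 4 = 52. */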
398
399
static void
400
bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
401
unsigned nr_components, unsigned offset)
402
{
403
bi_load_sysval_to(b, bi_dest_index(&intr->dest),
404
panfrost_sysval_for_instr(&intr->instr, NULL),
405
nr_components, offset);
406
}
407
408
static bi_index
409
bi_load_sysval(bi_builder *b, int sysval,
410
unsigned nr_components, unsigned offset)
411
{
412
bi_index tmp = bi_temp(b->shader);
413
bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
414
return tmp;
415
}
416
417
static void
418
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
419
{
420
ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
421
422
/* Source color is passed through r0-r3, or r4-r7 for the second
423
* source when dual-source blending. TODO: Precolour instead */
424
bi_index srcs[] = {
425
bi_register(0), bi_register(1), bi_register(2), bi_register(3)
426
};
427
bi_index srcs2[] = {
428
bi_register(4), bi_register(5), bi_register(6), bi_register(7)
429
};
430
431
bool second_source = (sem.location == VARYING_SLOT_VAR0);
432
433
bi_make_vec_to(b, bi_dest_index(&instr->dest),
434
second_source ? srcs2 : srcs,
435
NULL, 4, 32);
436
}
437
438
static void
439
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
440
{
441
/* Reads 2 or 4 staging registers to cover the input */
442
unsigned sr_count = (nir_alu_type_get_type_size(T) <= 16) ? 2 : 4;
443
444
if (b->shader->inputs->is_blend) {
445
uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
446
447
/* Blend descriptor comes from the compile inputs */
448
/* Put the result in r0 */
449
bi_blend_to(b, bi_register(0), rgba,
450
bi_register(60),
451
bi_imm_u32(blend_desc & 0xffffffff),
452
bi_imm_u32(blend_desc >> 32), sr_count);
453
} else {
454
/* Blend descriptor comes from the FAU RAM. By convention, the
455
* return address is stored in r48 and will be used by the
456
* blend shader to jump back to the fragment shader after */
457
bi_blend_to(b, bi_register(48), rgba,
458
bi_register(60),
459
bi_fau(BIR_FAU_BLEND_0 + rt, false),
460
bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);
461
}
462
463
assert(rt < 8);
464
b->shader->info->bifrost.blend[rt].type = T;
465
}
466
467
/* Blend shaders do not need to run ATEST since they are dependent on a
468
* fragment shader that runs it. Blit shaders may not need to run ATEST, since
469
* ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
470
* there are no writes to the coverage mask. The latter two are satisfied for
471
* all blit shaders, so we just care about early-z, which blit shaders force
472
* iff they do not write depth or stencil */
473
474
static bool
475
bi_skip_atest(bi_context *ctx, bool emit_zs)
476
{
477
return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
478
}
479
480
static void
481
bi_emit_atest(bi_builder *b, bi_index alpha)
482
{
483
bi_index coverage = bi_register(60);
484
bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
485
b->shader->emitted_atest = true;
486
487
/* Pseudo-source to encode in the tuple */
488
atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
489
}
490
491
static void
492
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
493
{
494
bool combined = instr->intrinsic ==
495
nir_intrinsic_store_combined_output_pan;
496
497
unsigned writeout = combined ? nir_intrinsic_component(instr) :
498
PAN_WRITEOUT_C;
499
500
bool emit_blend = writeout & (PAN_WRITEOUT_C);
501
bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
502
503
const nir_variable *var =
504
nir_find_variable_with_driver_location(b->shader->nir,
505
nir_var_shader_out, nir_intrinsic_base(instr));
506
assert(var);
507
508
unsigned loc = var->data.location;
509
bi_index src0 = bi_src_index(&instr->src[0]);
510
511
/* By ISA convention, the coverage mask is stored in R60. The store
512
* itself will be handled by a subsequent ATEST instruction */
513
if (loc == FRAG_RESULT_SAMPLE_MASK) {
514
bi_index orig = bi_register(60);
515
bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
516
bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
517
bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
518
return;
519
}
520
521
522
/* Dual-source blending is implemented by putting the color in
523
* registers r4-r7. */
524
if (var->data.index) {
525
unsigned count = nir_src_num_components(instr->src[0]);
526
527
for (unsigned i = 0; i < count; ++i)
528
bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));
529
530
b->shader->info->bifrost.blend_src1_type =
531
nir_intrinsic_src_type(instr);
532
533
return;
534
}
535
536
/* Emit ATEST if we have to; note ATEST requires a floating-point alpha
537
* value, but render target #0 might not be floating point. However the
538
* alpha value is only used for alpha-to-coverage, a stage which is
539
* skipped for pure integer framebuffers, so the issue is moot. */
540
541
if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
542
nir_alu_type T = nir_intrinsic_src_type(instr);
543
544
bi_index rgba = bi_src_index(&instr->src[0]);
545
bi_index alpha =
546
(T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :
547
(T == nir_type_float32) ? bi_word(rgba, 3) :
548
bi_dontcare();
549
550
/* Don't read out-of-bounds */
551
if (nir_src_num_components(instr->src[0]) < 4)
552
alpha = bi_imm_f32(1.0);
553
554
bi_emit_atest(b, alpha);
555
}
556
557
if (emit_zs) {
558
bi_index z = { 0 }, s = { 0 };
559
560
if (writeout & PAN_WRITEOUT_Z)
561
z = bi_src_index(&instr->src[2]);
562
563
if (writeout & PAN_WRITEOUT_S)
564
s = bi_src_index(&instr->src[3]);
565
566
bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
567
writeout & PAN_WRITEOUT_S,
568
writeout & PAN_WRITEOUT_Z);
569
}
570
571
if (emit_blend) {
572
assert(loc >= FRAG_RESULT_DATA0);
573
574
unsigned rt = (loc - FRAG_RESULT_DATA0);
575
bi_index color = bi_src_index(&instr->src[0]);
576
577
/* Explicit copy since BLEND inputs are precoloured to R0-R3.
578
* TODO: maybe schedule around this or implement in RA as a
579
* spill */
580
bool has_mrt = false;
581
582
nir_foreach_shader_out_variable(var, b->shader->nir)
583
has_mrt |= (var->data.location > FRAG_RESULT_DATA0);
584
585
if (has_mrt) {
586
bi_index srcs[4] = { color, color, color, color };
587
unsigned channels[4] = { 0, 1, 2, 3 };
588
color = bi_temp(b->shader);
589
bi_make_vec_to(b, color, srcs, channels,
590
nir_src_num_components(instr->src[0]),
591
nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
592
}
593
594
bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);
595
}
596
597
if (b->shader->inputs->is_blend) {
598
/* Jump back to the fragment shader, return address is stored
599
* in r48 (see above).
600
*/
601
bi_jump(b, bi_register(48));
602
}
603
}
604
605
static void
606
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
607
{
608
/* In principle we can do better for 16-bit. At the moment we require
609
* 32-bit to permit the use of .auto, in order to force .u32 for flat
610
* varyings, to handle internal TGSI shaders that set flat in the VS
611
* but smooth in the FS */
612
613
ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
614
assert(nir_alu_type_get_type_size(T) == 32);
615
enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
616
617
unsigned imm_index = 0;
618
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
619
620
bi_index address;
621
if (immediate) {
622
address = bi_lea_attr_imm(b,
623
bi_register(61), bi_register(62),
624
regfmt, imm_index);
625
} else {
626
bi_index idx =
627
bi_iadd_u32(b,
628
bi_src_index(nir_get_io_offset_src(instr)),
629
bi_imm_u32(nir_intrinsic_base(instr)),
630
false);
631
address = bi_lea_attr(b,
632
bi_register(61), bi_register(62),
633
idx, regfmt);
634
}
635
636
/* Only look at the total components needed. In effect, we fill in all
637
* the intermediate "holes" in the write mask, since we can't mask off
638
* stores. Since nir_lower_io_to_temporaries ensures each varying is
639
* written at most once, anything that's masked out is undefined, so it
640
* doesn't matter what we write there. So we may as well do the
641
* simplest thing possible. */
642
unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
643
assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
644
645
bi_st_cvt(b, bi_src_index(&instr->src[0]), address,
646
bi_word(address, 1), bi_word(address, 2),
647
regfmt, nr - 1);
648
}
649
650
static void
651
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
652
{
653
nir_src *offset = nir_get_io_offset_src(instr);
654
655
bool offset_is_const = nir_src_is_const(*offset);
656
bi_index dyn_offset = bi_src_index(offset);
657
uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
658
bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
659
660
bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
661
bi_dest_index(&instr->dest), offset_is_const ?
662
bi_imm_u32(const_offset) : dyn_offset,
663
kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),
664
BI_SEG_UBO);
665
}
666
667
static bi_index
668
bi_addr_high(nir_src *src)
669
{
670
return (nir_src_bit_size(*src) == 64) ?
671
bi_word(bi_src_index(src), 1) : bi_zero();
672
}
673
674
static void
675
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
676
{
677
bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
678
bi_dest_index(&instr->dest),
679
bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),
680
seg);
681
}
682
683
static void
684
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
685
{
686
/* Require contiguous masks, guaranteed by nir_lower_wrmasks */
687
assert(nir_intrinsic_write_mask(instr) ==
688
BITFIELD_MASK(instr->num_components));
689
690
bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
691
bi_src_index(&instr->src[0]),
692
bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),
693
seg);
694
}
695
696
/* Exchanges the staging register with memory */
697
698
static void
699
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
700
{
701
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
702
703
unsigned sz = nir_src_bit_size(*arg);
704
assert(sz == 32 || sz == 64);
705
706
bi_index data = bi_src_index(arg);
707
708
bi_index data_words[] = {
709
bi_word(data, 0),
710
bi_word(data, 1),
711
};
712
713
bi_index inout = bi_temp_reg(b->shader);
714
bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);
715
716
bi_axchg_to(b, sz, inout, inout,
717
bi_word(addr, 0),
718
(seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
719
seg);
720
721
bi_index inout_words[] = {
722
bi_word(inout, 0),
723
bi_word(inout, 1),
724
};
725
726
bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
727
}
728
729
/* Exchanges the second staging register with memory if comparison with first
730
* staging register passes */
731
732
static void
733
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
734
{
735
assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
736
737
/* the hardware operand order is swapped relative to NIR */
738
bi_index src0 = bi_src_index(arg_2);
739
bi_index src1 = bi_src_index(arg_1);
740
741
unsigned sz = nir_src_bit_size(*arg_1);
742
assert(sz == 32 || sz == 64);
743
744
bi_index data_words[] = {
745
bi_word(src0, 0),
746
sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),
747
748
/* 64-bit */
749
bi_word(src1, 0),
750
bi_word(src1, 1),
751
};
752
753
bi_index inout = bi_temp_reg(b->shader);
754
bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);
755
756
bi_acmpxchg_to(b, sz, inout, inout,
757
bi_word(addr, 0),
758
(seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
759
seg);
760
761
bi_index inout_words[] = {
762
bi_word(inout, 0),
763
bi_word(inout, 1),
764
};
765
766
bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
767
}
768
769
/* Extracts an atomic opcode */
770
771
static enum bi_atom_opc
772
bi_atom_opc_for_nir(nir_intrinsic_op op)
773
{
774
switch (op) {
775
case nir_intrinsic_global_atomic_add:
776
case nir_intrinsic_shared_atomic_add:
777
case nir_intrinsic_image_atomic_add:
778
return BI_ATOM_OPC_AADD;
779
780
case nir_intrinsic_global_atomic_imin:
781
case nir_intrinsic_shared_atomic_imin:
782
case nir_intrinsic_image_atomic_imin:
783
return BI_ATOM_OPC_ASMIN;
784
785
case nir_intrinsic_global_atomic_umin:
786
case nir_intrinsic_shared_atomic_umin:
787
case nir_intrinsic_image_atomic_umin:
788
return BI_ATOM_OPC_AUMIN;
789
790
case nir_intrinsic_global_atomic_imax:
791
case nir_intrinsic_shared_atomic_imax:
792
case nir_intrinsic_image_atomic_imax:
793
return BI_ATOM_OPC_ASMAX;
794
795
case nir_intrinsic_global_atomic_umax:
796
case nir_intrinsic_shared_atomic_umax:
797
case nir_intrinsic_image_atomic_umax:
798
return BI_ATOM_OPC_AUMAX;
799
800
case nir_intrinsic_global_atomic_and:
801
case nir_intrinsic_shared_atomic_and:
802
case nir_intrinsic_image_atomic_and:
803
return BI_ATOM_OPC_AAND;
804
805
case nir_intrinsic_global_atomic_or:
806
case nir_intrinsic_shared_atomic_or:
807
case nir_intrinsic_image_atomic_or:
808
return BI_ATOM_OPC_AOR;
809
810
case nir_intrinsic_global_atomic_xor:
811
case nir_intrinsic_shared_atomic_xor:
812
case nir_intrinsic_image_atomic_xor:
813
return BI_ATOM_OPC_AXOR;
814
815
default:
816
unreachable("Unexpected computational atomic");
817
}
818
}
819
820
/* Optimized unary atomics are available with an implied #1 argument */
821
822
static bool
823
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
824
{
825
/* Check we have a compatible constant */
826
if (arg.type != BI_INDEX_CONSTANT)
827
return false;
828
829
if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
830
return false;
831
832
/* Check for a compatible operation */
833
switch (op) {
834
case BI_ATOM_OPC_AADD:
835
*out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
836
return true;
837
case BI_ATOM_OPC_ASMAX:
838
*out = BI_ATOM_OPC_ASMAX1;
839
return true;
840
case BI_ATOM_OPC_AUMAX:
841
*out = BI_ATOM_OPC_AUMAX1;
842
return true;
843
case BI_ATOM_OPC_AOR:
844
*out = BI_ATOM_OPC_AOR1;
845
return true;
846
default:
847
return false;
848
}
849
}
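
/* E.g. an atomic add of constant +1 promotes to AINC, an add of -1 to ADEC,
 * a signed/unsigned max against 1 to ASMAX1/AUMAX1, and an OR with 1 to
 * AOR1; every other case keeps the generic ATOM_C path with an explicit
 * staging argument. */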
850
851
/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */
852
853
static bi_index
854
bi_emit_image_coord(bi_builder *b, bi_index coord)
855
{
856
return bi_mkvec_v2i16(b,
857
bi_half(bi_word(coord, 0), false),
858
bi_half(bi_word(coord, 1), false));
859
}
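
/* I.e. the first two 32-bit NIR coordinates are repacked as a pair of
 * 16-bit halves, (x.h0, y.h0), in one register word; the caller repeats
 * this on bi_word(coord, 2) to pack (z, w) when present. */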
860
861
static bi_index
862
bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
863
{
864
nir_src src = instr->src[0];
865
bi_index index = bi_src_index(&src);
866
bi_context *ctx = b->shader;
867
868
/* Images come after vertex attributes, so handle an explicit offset */
869
unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
870
util_bitcount64(ctx->nir->info.inputs_read) : 0;
871
872
if (offset == 0)
873
return index;
874
else if (nir_src_is_const(src))
875
return bi_imm_u32(nir_src_as_uint(src) + offset);
876
else
877
return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
878
}
879
880
static void
881
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
882
{
883
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
884
ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
885
886
bi_index coords = bi_src_index(&instr->src[1]);
887
/* TODO: MSAA */
888
assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
889
890
bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),
891
bi_emit_image_coord(b, coords),
892
bi_emit_image_coord(b, bi_word(coords, 2)),
893
bi_emit_image_index(b, instr),
894
bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),
895
instr->num_components - 1);
896
}
897
898
static bi_index
899
bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
900
{
901
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
902
ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
903
904
/* TODO: MSAA */
905
assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
906
907
enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
908
bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
909
BI_REGISTER_FORMAT_AUTO;
910
911
bi_index coords = bi_src_index(&instr->src[1]);
912
bi_index xy = bi_emit_image_coord(b, coords);
913
bi_index zw = bi_emit_image_coord(b, bi_word(coords, 2));
914
915
bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,
916
bi_emit_image_index(b, instr), type);
917
918
/* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI
919
* has all images in the primary attribute table */
920
I->table = BI_TABLE_ATTRIBUTE_1;
921
922
return I->dest[0];
923
}
924
925
static void
926
bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
927
{
928
bi_index addr = bi_emit_lea_image(b, instr);
929
930
bi_st_cvt(b, bi_src_index(&instr->src[3]),
931
addr, bi_word(addr, 1), bi_word(addr, 2),
932
bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
933
instr->num_components - 1);
934
}
935
936
static void
937
bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
938
bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
939
{
940
/* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
941
* take any vector but can still output in RETURN mode */
942
bi_index sr = bi_temp_reg(b->shader);
943
944
enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
945
enum bi_atom_opc post_opc = opc;
946
947
/* Generate either ATOM_C or ATOM_C1 as required */
948
if (bi_promote_atom_c1(opc, arg, &opc)) {
949
bi_patom_c1_i32_to(b, sr, bi_word(addr, 0),
950
bi_word(addr, 1), opc, 2);
951
} else {
952
bi_mov_i32_to(b, sr, arg);
953
bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),
954
bi_word(addr, 1), opc, 2);
955
}
956
957
/* Post-process it */
958
bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);
959
}
960
961
/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
962
* gl_FragCoord.z = ld_vary(fragz)
963
* gl_FragCoord.w = ld_vary(fragw)
964
*/
965
966
static void
967
bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
968
{
969
bi_index src[4] = {};
970
971
for (unsigned i = 0; i < 2; ++i) {
972
src[i] = bi_fadd_f32(b,
973
bi_u16_to_f32(b, bi_half(bi_register(59), i)),
974
bi_imm_f32(0.5f), BI_ROUND_NONE);
975
}
976
977
for (unsigned i = 0; i < 2; ++i) {
978
src[2 + i] = bi_ld_var_special(b, bi_zero(),
979
BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
980
BI_UPDATE_CLOBBER,
981
(i == 0) ? BI_VARYING_NAME_FRAG_Z :
982
BI_VARYING_NAME_FRAG_W,
983
BI_VECSIZE_NONE);
984
}
985
986
bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
987
}
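
/* Host-side sketch of the .xy path above, assuming the fixed-function pixel
 * position arrives as unsigned 16-bit integers in r59 (illustrative only):
 *
 *   #include <stdint.h>
 *
 *   static inline float
 *   frag_coord_component(uint16_t fixed)
 *   {
 *           return (float)fixed + 0.5f;    // pixel-centre convention
 *   }
 */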
988
989
static void
990
bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
991
{
992
unsigned rt = b->shader->inputs->blend.rt;
993
unsigned size = nir_dest_bit_size(instr->dest);
994
995
/* Get the render target */
996
if (!b->shader->inputs->is_blend) {
997
const nir_variable *var =
998
nir_find_variable_with_driver_location(b->shader->nir,
999
nir_var_shader_out, nir_intrinsic_base(instr));
1000
unsigned loc = var->data.location;
1001
assert(loc >= FRAG_RESULT_DATA0);
1002
rt = (loc - FRAG_RESULT_DATA0);
1003
}
1004
1005
/* We want to load the current pixel.
1006
* FIXME: The sample to load is currently hardcoded to 0. This should
1007
* be addressed for multi-sample FBs.
1008
*/
1009
struct bifrost_pixel_indices pix = {
1010
.y = BIFROST_CURRENT_PIXEL,
1011
.rt = rt
1012
};
1013
1014
bi_index desc = b->shader->inputs->is_blend ?
1015
bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
1016
bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
1017
1018
uint32_t indices = 0;
1019
memcpy(&indices, &pix, sizeof(indices));
1020
1021
bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_imm_u32(indices),
1022
bi_register(60), desc, (instr->num_components - 1));
1023
}
1024
1025
static void
1026
bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1027
{
1028
bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
1029
bi_dest_index(&instr->dest) : bi_null();
1030
gl_shader_stage stage = b->shader->stage;
1031
1032
switch (instr->intrinsic) {
1033
case nir_intrinsic_load_barycentric_pixel:
1034
case nir_intrinsic_load_barycentric_centroid:
1035
case nir_intrinsic_load_barycentric_sample:
1036
case nir_intrinsic_load_barycentric_at_sample:
1037
case nir_intrinsic_load_barycentric_at_offset:
1038
/* handled later via load_vary */
1039
break;
1040
case nir_intrinsic_load_interpolated_input:
1041
case nir_intrinsic_load_input:
1042
if (b->shader->inputs->is_blend)
1043
bi_emit_load_blend_input(b, instr);
1044
else if (stage == MESA_SHADER_FRAGMENT)
1045
bi_emit_load_vary(b, instr);
1046
else if (stage == MESA_SHADER_VERTEX)
1047
bi_emit_load_attr(b, instr);
1048
else
1049
unreachable("Unsupported shader stage");
1050
break;
1051
1052
case nir_intrinsic_store_output:
1053
if (stage == MESA_SHADER_FRAGMENT)
1054
bi_emit_fragment_out(b, instr);
1055
else if (stage == MESA_SHADER_VERTEX)
1056
bi_emit_store_vary(b, instr);
1057
else
1058
unreachable("Unsupported shader stage");
1059
break;
1060
1061
case nir_intrinsic_store_combined_output_pan:
1062
assert(stage == MESA_SHADER_FRAGMENT);
1063
bi_emit_fragment_out(b, instr);
1064
break;
1065
1066
case nir_intrinsic_load_ubo:
1067
case nir_intrinsic_load_kernel_input:
1068
bi_emit_load_ubo(b, instr);
1069
break;
1070
1071
case nir_intrinsic_load_global:
1072
case nir_intrinsic_load_global_constant:
1073
bi_emit_load(b, instr, BI_SEG_NONE);
1074
break;
1075
1076
case nir_intrinsic_store_global:
1077
bi_emit_store(b, instr, BI_SEG_NONE);
1078
break;
1079
1080
case nir_intrinsic_load_scratch:
1081
bi_emit_load(b, instr, BI_SEG_TL);
1082
break;
1083
1084
case nir_intrinsic_store_scratch:
1085
bi_emit_store(b, instr, BI_SEG_TL);
1086
break;
1087
1088
case nir_intrinsic_load_shared:
1089
bi_emit_load(b, instr, BI_SEG_WLS);
1090
break;
1091
1092
case nir_intrinsic_store_shared:
1093
bi_emit_store(b, instr, BI_SEG_WLS);
1094
break;
1095
1096
/* The blob doesn't seem to do anything for memory barriers; note +BARRIER
1097
* is illegal in fragment shaders */
1098
case nir_intrinsic_memory_barrier:
1099
case nir_intrinsic_memory_barrier_buffer:
1100
case nir_intrinsic_memory_barrier_image:
1101
case nir_intrinsic_memory_barrier_shared:
1102
case nir_intrinsic_group_memory_barrier:
1103
break;
1104
1105
case nir_intrinsic_control_barrier:
1106
assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1107
bi_barrier(b);
1108
break;
1109
1110
case nir_intrinsic_shared_atomic_add:
1111
case nir_intrinsic_shared_atomic_imin:
1112
case nir_intrinsic_shared_atomic_umin:
1113
case nir_intrinsic_shared_atomic_imax:
1114
case nir_intrinsic_shared_atomic_umax:
1115
case nir_intrinsic_shared_atomic_and:
1116
case nir_intrinsic_shared_atomic_or:
1117
case nir_intrinsic_shared_atomic_xor: {
1118
assert(nir_src_bit_size(instr->src[1]) == 32);
1119
1120
bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),
1121
bi_zero(), false, BI_SEG_WLS);
1122
1123
bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
1124
instr->intrinsic);
1125
break;
1126
}
1127
1128
case nir_intrinsic_image_atomic_add:
1129
case nir_intrinsic_image_atomic_imin:
1130
case nir_intrinsic_image_atomic_umin:
1131
case nir_intrinsic_image_atomic_imax:
1132
case nir_intrinsic_image_atomic_umax:
1133
case nir_intrinsic_image_atomic_and:
1134
case nir_intrinsic_image_atomic_or:
1135
case nir_intrinsic_image_atomic_xor:
1136
assert(nir_src_bit_size(instr->src[3]) == 32);
1137
1138
bi_emit_atomic_i32_to(b, dst,
1139
bi_emit_lea_image(b, instr),
1140
bi_src_index(&instr->src[3]),
1141
instr->intrinsic);
1142
break;
1143
1144
case nir_intrinsic_global_atomic_add:
1145
case nir_intrinsic_global_atomic_imin:
1146
case nir_intrinsic_global_atomic_umin:
1147
case nir_intrinsic_global_atomic_imax:
1148
case nir_intrinsic_global_atomic_umax:
1149
case nir_intrinsic_global_atomic_and:
1150
case nir_intrinsic_global_atomic_or:
1151
case nir_intrinsic_global_atomic_xor:
1152
assert(nir_src_bit_size(instr->src[1]) == 32);
1153
1154
bi_emit_atomic_i32_to(b, dst,
1155
bi_src_index(&instr->src[0]),
1156
bi_src_index(&instr->src[1]),
1157
instr->intrinsic);
1158
break;
1159
1160
case nir_intrinsic_image_load:
1161
bi_emit_image_load(b, instr);
1162
break;
1163
1164
case nir_intrinsic_image_store:
1165
bi_emit_image_store(b, instr);
1166
break;
1167
1168
case nir_intrinsic_global_atomic_exchange:
1169
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1170
&instr->src[1], BI_SEG_NONE);
1171
break;
1172
1173
case nir_intrinsic_image_atomic_exchange:
1174
bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
1175
&instr->src[3], BI_SEG_NONE);
1176
break;
1177
1178
case nir_intrinsic_shared_atomic_exchange:
1179
bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1180
&instr->src[1], BI_SEG_WLS);
1181
break;
1182
1183
case nir_intrinsic_global_atomic_comp_swap:
1184
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1185
&instr->src[1], &instr->src[2], BI_SEG_NONE);
1186
break;
1187
1188
case nir_intrinsic_image_atomic_comp_swap:
1189
bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
1190
&instr->src[3], &instr->src[4], BI_SEG_NONE);
1191
break;
1192
1193
case nir_intrinsic_shared_atomic_comp_swap:
1194
bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1195
&instr->src[1], &instr->src[2], BI_SEG_WLS);
1196
break;
1197
1198
case nir_intrinsic_load_frag_coord:
1199
bi_emit_load_frag_coord(b, instr);
1200
break;
1201
1202
case nir_intrinsic_load_output:
1203
bi_emit_ld_tile(b, instr);
1204
break;
1205
1206
case nir_intrinsic_discard_if: {
1207
bi_index src = bi_src_index(&instr->src[0]);
1208
assert(nir_src_bit_size(instr->src[0]) == 1);
1209
bi_discard_f32(b, bi_half(src, false), bi_imm_u16(0), BI_CMPF_NE);
1210
break;
1211
}
1212
1213
case nir_intrinsic_discard:
1214
bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1215
break;
1216
1217
case nir_intrinsic_load_ssbo_address:
1218
bi_load_sysval_nir(b, instr, 2, 0);
1219
break;
1220
1221
case nir_intrinsic_load_work_dim:
1222
bi_load_sysval_nir(b, instr, 1, 0);
1223
break;
1224
1225
case nir_intrinsic_load_first_vertex:
1226
bi_load_sysval_nir(b, instr, 1, 0);
1227
break;
1228
1229
case nir_intrinsic_load_base_vertex:
1230
bi_load_sysval_nir(b, instr, 1, 4);
1231
break;
1232
1233
case nir_intrinsic_load_base_instance:
1234
bi_load_sysval_nir(b, instr, 1, 8);
1235
break;
1236
1237
case nir_intrinsic_load_draw_id:
1238
bi_load_sysval_nir(b, instr, 1, 0);
1239
break;
1240
1241
case nir_intrinsic_get_ssbo_size:
1242
bi_load_sysval_nir(b, instr, 1, 8);
1243
break;
1244
1245
case nir_intrinsic_load_viewport_scale:
1246
case nir_intrinsic_load_viewport_offset:
1247
case nir_intrinsic_load_num_workgroups:
1248
case nir_intrinsic_load_workgroup_size:
1249
bi_load_sysval_nir(b, instr, 3, 0);
1250
break;
1251
1252
case nir_intrinsic_image_size:
1253
bi_load_sysval_nir(b, instr,
1254
nir_dest_num_components(instr->dest), 0);
1255
break;
1256
1257
case nir_intrinsic_load_sample_positions_pan:
1258
bi_mov_i32_to(b, bi_word(dst, 0),
1259
bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));
1260
bi_mov_i32_to(b, bi_word(dst, 1),
1261
bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1262
break;
1263
1264
case nir_intrinsic_load_sample_mask_in:
1265
/* r61[0:15] contains the coverage bitmap */
1266
bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
1267
break;
1268
1269
case nir_intrinsic_load_sample_id: {
1270
/* r61[16:23] contains the sampleID, mask it out. Upper bits
1271
* seem to read garbage (despite being architecturally defined
1272
* as zero), so use a 5-bit mask instead of an 8-bit one */
1273
1274
bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
1275
bi_imm_u8(16));
1276
break;
1277
}
1278
1279
case nir_intrinsic_load_front_face:
1280
/* r58 == 0 means primitive is front facing */
1281
bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
1282
BI_RESULT_TYPE_M1);
1283
break;
1284
1285
case nir_intrinsic_load_point_coord:
1286
bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1287
BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1288
BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1289
break;
1290
1291
case nir_intrinsic_load_vertex_id_zero_base:
1292
bi_mov_i32_to(b, dst, bi_register(61));
1293
break;
1294
1295
case nir_intrinsic_load_instance_id:
1296
bi_mov_i32_to(b, dst, bi_register(62));
1297
break;
1298
1299
case nir_intrinsic_load_subgroup_invocation:
1300
bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1301
break;
1302
1303
case nir_intrinsic_load_local_invocation_id:
1304
for (unsigned i = 0; i < 3; ++i)
1305
bi_u16_to_u32_to(b, bi_word(dst, i),
1306
bi_half(bi_register(55 + i / 2), i % 2));
1307
break;
1308
1309
case nir_intrinsic_load_workgroup_id:
1310
for (unsigned i = 0; i < 3; ++i)
1311
bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));
1312
break;
1313
1314
case nir_intrinsic_load_global_invocation_id:
1315
case nir_intrinsic_load_global_invocation_id_zero_base:
1316
for (unsigned i = 0; i < 3; ++i)
1317
bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));
1318
break;
1319
1320
case nir_intrinsic_shader_clock:
1321
bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1322
break;
1323
1324
default:
1325
fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
1326
assert(0);
1327
}
1328
}
1329
1330
static void
1331
bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1332
{
1333
/* Make sure we've been lowered */
1334
assert(instr->def.num_components <= (32 / instr->def.bit_size));
1335
1336
/* Accumulate all the channels of the constant, as if we did an
1337
* implicit SEL over them */
1338
uint32_t acc = 0;
1339
1340
for (unsigned i = 0; i < instr->def.num_components; ++i) {
1341
unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1342
acc |= (v << (i * instr->def.bit_size));
1343
}
1344
1345
bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
1346
}
1347
1348
static bi_index
1349
bi_alu_src_index(nir_alu_src src, unsigned comps)
1350
{
1351
/* we don't lower modifiers until the backend */
1352
assert(!(src.negate || src.abs));
1353
1354
unsigned bitsize = nir_src_bit_size(src.src);
1355
1356
/* TODO: Do we need to do something more clever with 1-bit bools? */
1357
if (bitsize == 1)
1358
bitsize = 16;
1359
1360
/* the bi_index carries the 32-bit (word) offset separate from the
1361
* subword swizzle, first handle the offset */
1362
1363
unsigned offset = 0;
1364
1365
assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1366
unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1367
1368
for (unsigned i = 0; i < comps; ++i) {
1369
unsigned new_offset = (src.swizzle[i] >> subword_shift);
1370
1371
if (i > 0)
1372
assert(offset == new_offset && "wrong vectorization");
1373
1374
offset = new_offset;
1375
}
1376
1377
bi_index idx = bi_word(bi_src_index(&src.src), offset);
1378
1379
/* Compose the subword swizzle with existing (identity) swizzle */
1380
assert(idx.swizzle == BI_SWIZZLE_H01);
1381
1382
/* Bigger vectors should have been lowered */
1383
assert(comps <= (1 << subword_shift));
1384
1385
if (bitsize == 16) {
1386
unsigned c0 = src.swizzle[0] & 1;
1387
unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1388
idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1389
} else if (bitsize == 8) {
1390
/* 8-bit vectors not yet supported */
1391
assert(comps == 1 && "8-bit vectors not supported");
1392
assert(src.swizzle[0] == 0 && "8-bit vectors not supported");
1393
idx.swizzle = BI_SWIZZLE_B0000;
1394
}
1395
1396
return idx;
1397
}
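
/* For example, a 16-bit source swizzled .zw selects 32-bit word 1 with the
 * identity subword swizzle (H01), while .yx within word 0 composes to H10;
 * mixing words in one source would trip the "wrong vectorization" assert
 * above. */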
1398
1399
static enum bi_round
1400
bi_nir_round(nir_op op)
1401
{
1402
switch (op) {
1403
case nir_op_fround_even: return BI_ROUND_NONE;
1404
case nir_op_ftrunc: return BI_ROUND_RTZ;
1405
case nir_op_fceil: return BI_ROUND_RTP;
1406
case nir_op_ffloor: return BI_ROUND_RTN;
1407
default: unreachable("invalid nir round op");
1408
}
1409
}
1410
1411
/* Convenience for lowered transcendentals */
1412
1413
static bi_index
1414
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
1415
{
1416
return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
1417
}
1418
1419
/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
1420
* Newton-Raphson to improve precision */
1421
1422
static void
1423
bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
1424
{
1425
bi_index x1 = bi_frcp_approx_f32(b, s0);
1426
bi_index m = bi_frexpm_f32(b, s0, false, false);
1427
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
1428
bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
1429
bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
1430
bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
1431
BI_ROUND_NONE, BI_SPECIAL_NONE);
1432
}
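
/* The refinement above is one Newton-Raphson step on f(x) = 1/x - a around
 * the hardware estimate x1, i.e. x1 + x1 * (1 - a * x1). A plain float
 * sketch of that identity (illustrative; the real lowering splits a into
 * mantissa and exponent with FREXPM/FREXPE so the FMA stays in range):
 *
 *   static inline float
 *   refine_rcp(float a, float x1)
 *   {
 *           return x1 + x1 * (1.0f - a * x1);
 *   }
 */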
1433
1434
static void
1435
bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
1436
{
1437
bi_index x1 = bi_frsq_approx_f32(b, s0);
1438
bi_index m = bi_frexpm_f32(b, s0, false, true);
1439
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
1440
bi_index t1 = bi_fmul_f32(b, x1, x1);
1441
bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
1442
bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
1443
bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
1444
BI_ROUND_NONE, BI_SPECIAL_N);
1445
}
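
/* Likewise for 1/sqrt(a): one Newton-Raphson step around the estimate x1 is
 * x1 + x1 * (1 - a * x1 * x1) / 2, with the divide-by-two folded into the
 * -1 rscale above. A plain float sketch of the identity (illustrative only):
 *
 *   static inline float
 *   refine_rsq(float a, float x1)
 *   {
 *           return x1 + x1 * (1.0f - a * x1 * x1) * 0.5f;
 *   }
 */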
1446
1447
/* More complex transcendentals, see
1448
* https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
1449
* for documentation */
1450
1451
static void
1452
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
1453
{
1454
bi_index t1 = bi_temp(b->shader);
1455
bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
1456
s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
1457
t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
1458
1459
bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);
1460
1461
bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
1462
s0, bi_neg(t2), BI_ROUND_NONE);
1463
a2->clamp = BI_CLAMP_CLAMP_M1_1;
1464
1465
bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
1466
bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
1467
bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
1468
bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
1469
bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
1470
bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
1471
bi_imm_u32(0x3f317218), BI_ROUND_NONE);
1472
bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
1473
bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
1474
p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
1475
x->clamp = BI_CLAMP_CLAMP_0_INF;
1476
1477
bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
1478
max->sem = BI_SEM_NAN_PROPAGATE;
1479
}
1480
1481
static void
1482
bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
1483
{
1484
/* Scale by the log2 of the base, multiply by 2^24 and convert to integer to get an 8:24
1485
* fixed-point input */
1486
bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
1487
bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
1488
bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
1489
1490
/* Compute the result for the fixed-point input, but pass along
1491
* the floating-point scale for correct NaN propagation */
1492
bi_fexp_f32_to(b, dst, fixed_pt, scale);
1493
}
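
/* In other words, FEXP.f32 consumes its argument as an 8:24 fixed-point
 * base-2 exponent; a host-side sketch of the conversion performed by the
 * FMA + convert above (illustrative only):
 *
 *   #include <stdint.h>
 *
 *   static inline int32_t
 *   exp_arg_8_24(float x, float log2_base)
 *   {
 *           return (int32_t)(x * log2_base * 16777216.0f);   // 2^24
 *   }
 */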
1494
1495
static void
1496
bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1497
{
1498
/* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
1499
bi_index a1 = bi_frexpm_f32(b, s0, true, false);
1500
bi_index ei = bi_frexpe_f32(b, s0, true, false);
1501
bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);
1502
1503
/* xt estimates -log(r1), a coarse approximation of log(a1) */
1504
bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
1505
bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
1506
1507
/* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
1508
* log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
1509
* and then log(s0) = x1 + x2 */
1510
bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);
1511
1512
/* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
1513
* polynomial approximation around 1. The series is expressed around
1514
* 1, so set y = (a1 * r1) - 1.0 */
1515
bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);
1516
1517
/* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
1518
* log_e(1 + y) by the Taylor series (lower precision than the blob):
1519
* y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
1520
bi_index loge = bi_fmul_f32(b, y,
1521
bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));
1522
1523
bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
1524
1525
/* log(s0) = x1 + x2 */
1526
bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
1527
}
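
/* Putting the pieces together, the lowering evaluates
 *
 *   log2(s0) = (ef + xt) + (1/ln 2) * y * (1 - y/2),   y = a1 * r1 - 1
 *
 * where ef, xt, a1 and r1 come from FREXPE/FLOG_TABLE/FREXPM above. A float
 * sketch of just the polynomial tail (illustrative only):
 *
 *   static inline float
 *   log2_tail(float y)
 *   {
 *           return y * (1.0f - 0.5f * y) * (1.0f / 0.69314718f);   // 1/ln(2)
 *   }
 */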
1528
1529
static void
1530
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1531
{
1532
bi_index frexp = bi_frexpe_f32(b, s0, true, false);
1533
bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
1534
bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
1535
bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
1536
BI_ROUND_NONE);
1537
}
1538
1539
static void
1540
bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1541
{
1542
bi_index log2_base = bi_null();
1543
1544
if (base.type == BI_INDEX_CONSTANT) {
1545
log2_base = bi_imm_f32(log2f(uif(base.value)));
1546
} else {
1547
log2_base = bi_temp(b->shader);
1548
bi_lower_flog2_32(b, log2_base, base);
1549
}
1550
1551
return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
1552
}
1553
1554
static void
1555
bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1556
{
1557
bi_index log2_base = bi_null();
1558
1559
if (base.type == BI_INDEX_CONSTANT) {
1560
log2_base = bi_imm_f32(log2f(uif(base.value)));
1561
} else {
1562
log2_base = bi_temp(b->shader);
1563
bi_flog2_32(b, log2_base, base);
1564
}
1565
1566
return bi_fexp_32(b, dst, exp, log2_base);
1567
}
1568
1569
/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
1570
* FSIN/COS_TABLE.u6, which multiplies the bottom 6 bits by pi/32 and
1571
* calculates the results. We use them to calculate sin/cos via a Taylor
1572
* approximation:
1573
*
1574
* f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
1575
* sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
1576
* cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
1577
*/
1578
1579
#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
1580
#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
1581
#define SINCOS_BIAS bi_imm_u32(0x49400000)
1582
1583
static void
1584
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
1585
{
1586
/* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
1587
bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
1588
1589
/* Approximate domain error (small) */
1590
bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
1591
BI_ROUND_NONE),
1592
MPI_OVER_TWO, s0, BI_ROUND_NONE);
1593
1594
/* Lookup sin(x), cos(x) */
1595
bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
1596
bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
1597
1598
/* e^2 / 2 */
1599
bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
1600
bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
1601
1602
/* (-e^2)/2 f''(x) */
1603
bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
1604
cos ? cosx : sinx,
1605
bi_negzero(), BI_ROUND_NONE);
1606
1607
/* e f'(x) - (e^2/2) f''(x) */
1608
bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
1609
cos ? bi_neg(sinx) : cosx,
1610
quadratic, BI_ROUND_NONE);
1611
I->clamp = BI_CLAMP_CLAMP_M1_1;
1612
1613
/* f(x) + e f'(x) - (e^2/2) f''(x) */
1614
bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
1615
}
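
/* A libm-based sketch of the same second-order correction around the table
 * point x with small domain error e (illustrative only; the hardware tables
 * replace sinf/cosf at the sample points):
 *
 *   #include <math.h>
 *
 *   static inline float
 *   sin_corrected(float x, float e)
 *   {
 *           return sinf(x) + e * cosf(x) - 0.5f * e * e * sinf(x);
 *   }
 *
 *   static inline float
 *   cos_corrected(float x, float e)
 *   {
 *           return cosf(x) - e * sinf(x) - 0.5f * e * e * cosf(x);
 *   }
 */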
1616
1617
static bi_instr *
1618
bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
1619
bi_index dst, bi_index s0, bi_index s1, bi_index s2)
1620
{
1621
/* Handle 1-bit bools as 0/~0 by default and let the optimizer deal
1622
* with the bit patterns later. 0/~0 has the nice property of being
1623
* independent of replicated vectorization. */
1624
if (sz == 1) sz = 16;
1625
bi_index f = bi_zero();
1626
bi_index t = bi_imm_u16(0xFFFF);
1627
1628
switch (op) {
1629
case nir_op_feq:
1630
return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
1631
case nir_op_flt:
1632
return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1633
case nir_op_fge:
1634
return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1635
case nir_op_fneu:
1636
return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
1637
1638
case nir_op_ieq:
1639
return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
1640
case nir_op_ine:
1641
return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
1642
case nir_op_ilt:
1643
return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1644
case nir_op_ige:
1645
return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1646
case nir_op_ult:
1647
return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1648
case nir_op_uge:
1649
return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1650
1651
case nir_op_iand:
1652
return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1653
case nir_op_ior:
1654
return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1655
case nir_op_ixor:
1656
return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1657
case nir_op_inot:
1658
return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
1659
1660
case nir_op_f2b1:
1661
return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1662
case nir_op_i2b1:
1663
return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1664
case nir_op_b2b1:
1665
return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1666
1667
case nir_op_bcsel:
1668
return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);
1669
1670
default:
1671
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
1672
unreachable("Unhandled boolean ALU instruction");
1673
}
1674
}
1675
1676
static void
1677
bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
1678
{
1679
bi_index dst = bi_dest_index(&instr->dest.dest);
1680
unsigned srcs = nir_op_infos[instr->op].num_inputs;
1681
unsigned sz = nir_dest_bit_size(instr->dest.dest);
1682
unsigned comps = nir_dest_num_components(instr->dest.dest);
1683
unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
1684
unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0;
1685
bool is_bool = (sz == 1);
1686
1687
/* TODO: Anything else? */
1688
if (sz == 1)
1689
sz = 16;
1690
1691
/* Indicate scalarness */
1692
if (sz == 16 && comps == 1)
1693
dst.swizzle = BI_SWIZZLE_H00;
1694
1695
if (!instr->dest.dest.is_ssa) {
1696
for (unsigned i = 0; i < comps; ++i)
1697
assert(instr->dest.write_mask);
1698
}
1699
1700
/* First, match against the various moves in NIR. These are
1701
* special-cased because they can operate on vectors even after
1702
* lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
1703
* instruction is no "bigger" than SIMD-within-a-register. These moves
1704
* are the exceptions that need to handle swizzles specially. */
1705
1706
switch (instr->op) {
1707
case nir_op_pack_32_2x16:
1708
case nir_op_vec2:
1709
case nir_op_vec3:
1710
case nir_op_vec4: {
1711
bi_index unoffset_srcs[4] = {
1712
srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
1713
srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
1714
srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
1715
srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
1716
};
1717
1718
unsigned channels[4] = {
1719
instr->src[0].swizzle[0],
1720
instr->src[1].swizzle[0],
1721
srcs > 2 ? instr->src[2].swizzle[0] : 0,
1722
srcs > 3 ? instr->src[3].swizzle[0] : 0,
1723
};
1724
1725
bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
1726
return;
1727
}
1728
1729
case nir_op_vec8:
1730
case nir_op_vec16:
1731
unreachable("should've been lowered");
1732
1733
case nir_op_unpack_32_2x16:
1734
case nir_op_unpack_64_2x32_split_x:
1735
bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src));
1736
return;
1737
1738
case nir_op_unpack_64_2x32_split_y:
1739
bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1));
1740
return;
1741
1742
case nir_op_pack_64_2x32_split:
1743
bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src));
1744
bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src));
1745
return;
1746
1747
case nir_op_pack_64_2x32:
1748
bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0));
1749
bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1));
1750
return;
1751
1752
case nir_op_mov: {
1753
bi_index idx = bi_src_index(&instr->src[0].src);
1754
bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
1755
1756
unsigned channels[4] = {
1757
comps > 0 ? instr->src[0].swizzle[0] : 0,
1758
comps > 1 ? instr->src[0].swizzle[1] : 0,
1759
comps > 2 ? instr->src[0].swizzle[2] : 0,
1760
comps > 3 ? instr->src[0].swizzle[3] : 0,
1761
};
1762
1763
if (sz == 1) sz = 16;
1764
bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);
1765
return;
1766
}
1767
1768
case nir_op_f2f16:
1769
assert(src_sz == 32);
1770
bi_index idx = bi_src_index(&instr->src[0].src);
1771
bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
1772
bi_index s1 = comps > 1 ?
1773
bi_word(idx, instr->src[0].swizzle[1]) : s0;
1774
1775
bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
1776
return;
1777
1778
/* Vectorized downcasts */
1779
case nir_op_u2u16:
1780
case nir_op_i2i16: {
1781
if (!(src_sz == 32 && comps == 2))
1782
break;
1783
1784
bi_index idx = bi_src_index(&instr->src[0].src);
1785
bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
1786
bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]);
1787
1788
bi_mkvec_v2i16_to(b, dst,
1789
bi_half(s0, false), bi_half(s1, false));
1790
return;
1791
}
1792
1793
case nir_op_i2i8:
1794
case nir_op_u2u8:
1795
{
1796
/* Acts like an 8-bit swizzle */
1797
bi_index idx = bi_src_index(&instr->src[0].src);
1798
unsigned factor = src_sz / 8;
1799
unsigned chan[4] = { 0 };
1800
1801
for (unsigned i = 0; i < comps; ++i)
1802
chan[i] = instr->src[0].swizzle[i] * factor;
1803
1804
bi_make_vec_to(b, dst, &idx, chan, comps, 8);
1805
return;
1806
}
1807
1808
default:
1809
break;
1810
}
1811
1812
bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();
1813
bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();
1814
bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null();
1815
1816
if (is_bool) {
1817
bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2);
1818
return;
1819
}
1820
1821
switch (instr->op) {
1822
case nir_op_ffma:
1823
bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
1824
break;
1825
1826
case nir_op_fmul:
1827
bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
1828
break;
1829
1830
case nir_op_fsub:
1831
s1 = bi_neg(s1);
1832
FALLTHROUGH;
1833
case nir_op_fadd:
1834
bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
1835
break;
1836
1837
case nir_op_fsat: {
1838
bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);
1839
I->clamp = BI_CLAMP_CLAMP_0_1;
1840
break;
1841
}
1842
1843
case nir_op_fsat_signed_mali: {
1844
bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);
1845
I->clamp = BI_CLAMP_CLAMP_M1_1;
1846
break;
1847
}
1848
1849
case nir_op_fclamp_pos_mali: {
1850
bi_instr *I = bi_fadd_to(b, sz, dst, s0, bi_negzero(), BI_ROUND_NONE);
1851
I->clamp = BI_CLAMP_CLAMP_0_INF;
1852
break;
1853
}
1854
1855
case nir_op_fneg:
1856
bi_fadd_to(b, sz, dst, bi_neg(s0), bi_negzero(), BI_ROUND_NONE);
1857
break;
1858
1859
case nir_op_fabs:
1860
bi_fadd_to(b, sz, dst, bi_abs(s0), bi_negzero(), BI_ROUND_NONE);
1861
break;
1862
1863
case nir_op_fsin:
1864
bi_lower_fsincos_32(b, dst, s0, false);
1865
break;
1866
1867
case nir_op_fcos:
1868
bi_lower_fsincos_32(b, dst, s0, true);
1869
break;
1870
1871
case nir_op_fexp2:
1872
assert(sz == 32); /* should've been lowered */
1873
1874
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1875
bi_lower_fexp2_32(b, dst, s0);
1876
else
1877
bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
1878
1879
break;
1880
1881
case nir_op_flog2:
1882
assert(sz == 32); /* should've been lowered */
1883
1884
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1885
bi_lower_flog2_32(b, dst, s0);
1886
else
1887
bi_flog2_32(b, dst, s0);
1888
1889
break;
1890
1891
case nir_op_fpow:
1892
assert(sz == 32); /* should've been lowered */
1893
1894
if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1895
bi_lower_fpow_32(b, dst, s0, s1);
1896
else
1897
bi_fpow_32(b, dst, s0, s1);
1898
1899
break;
1900
1901
case nir_op_bcsel:
1902
if (src1_sz == 8)
1903
bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
1904
else
1905
bi_csel_to(b, nir_type_int, src1_sz,
1906
dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);
1907
break;
1908
1909
case nir_op_ishl:
1910
bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
1911
break;
1912
case nir_op_ushr:
1913
bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
1914
break;
1915
1916
case nir_op_ishr:
1917
bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
1918
break;
1919
1920
case nir_op_imin:
1921
case nir_op_umin:
1922
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
1923
s0, s1, s0, s1, BI_CMPF_LT);
1924
break;
1925
1926
case nir_op_imax:
1927
case nir_op_umax:
1928
bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
1929
s0, s1, s0, s1, BI_CMPF_GT);
1930
break;
1931
1932
case nir_op_fddx:
1933
case nir_op_fddy: {
1934
bi_index lane1 = bi_lshift_and_i32(b,
1935
bi_fau(BIR_FAU_LANE_ID, false),
1936
bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),
1937
bi_imm_u8(0));
1938
1939
bi_index lane2 = bi_iadd_u32(b, lane1,
1940
bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2),
1941
false);
1942
1943
bi_index left, right;
1944
1945
if (b->shader->arch == 6) {
1946
left = bi_clper_v6_i32(b, s0, lane1);
1947
right = bi_clper_v6_i32(b, s0, lane2);
1948
} else {
1949
left = bi_clper_v7_i32(b, s0, lane1,
1950
BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
1951
BI_SUBGROUP_SUBGROUP4);
1952
1953
right = bi_clper_v7_i32(b, s0, lane2,
1954
BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
1955
BI_SUBGROUP_SUBGROUP4);
1956
}
1957
1958
bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
1959
break;
1960
}
1961
1962
case nir_op_f2f32:
1963
bi_f16_to_f32_to(b, dst, s0);
1964
break;
1965
1966
case nir_op_f2i32:
1967
if (src_sz == 32)
1968
bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
1969
else
1970
bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
1971
break;
1972
1973
/* Note 32-bit sources => no vectorization, so 32-bit works */
1974
case nir_op_f2u16:
1975
if (src_sz == 32)
1976
bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
1977
else
1978
bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
1979
break;
1980
1981
case nir_op_f2i16:
1982
if (src_sz == 32)
1983
bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
1984
else
1985
bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
1986
break;
1987
1988
case nir_op_f2u32:
1989
if (src_sz == 32)
1990
bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
1991
else
1992
bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
1993
break;
1994
1995
case nir_op_u2f16:
1996
if (src_sz == 32)
1997
bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
1998
else if (src_sz == 16)
1999
bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
2000
else if (src_sz == 8)
2001
bi_v2u8_to_v2f16_to(b, dst, s0);
2002
break;
2003
2004
case nir_op_u2f32:
2005
if (src_sz == 32)
2006
bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
2007
else if (src_sz == 16)
2008
bi_u16_to_f32_to(b, dst, s0);
2009
else
2010
bi_u8_to_f32_to(b, dst, s0);
2011
break;
2012
2013
case nir_op_i2f16:
2014
if (src_sz == 32)
2015
bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
2016
else if (src_sz == 16)
2017
bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
2018
else if (src_sz == 8)
2019
bi_v2s8_to_v2f16_to(b, dst, s0);
2020
break;
2021
2022
case nir_op_i2f32:
2023
if (src_sz == 32)
2024
bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
2025
else if (src_sz == 16)
2026
bi_s16_to_f32_to(b, dst, s0);
2027
else if (src_sz == 8)
2028
bi_s8_to_f32_to(b, dst, s0);
2029
break;
2030
2031
case nir_op_i2i32:
2032
if (src_sz == 16)
2033
bi_s16_to_s32_to(b, dst, s0);
2034
else
2035
bi_s8_to_s32_to(b, dst, s0);
2036
break;
2037
2038
case nir_op_u2u32:
2039
if (src_sz == 16)
2040
bi_u16_to_u32_to(b, dst, s0);
2041
else
2042
bi_u8_to_u32_to(b, dst, s0);
2043
break;
2044
2045
case nir_op_i2i16:
2046
assert(src_sz == 8 || src_sz == 32);
2047
2048
if (src_sz == 8)
2049
bi_v2s8_to_v2s16_to(b, dst, s0);
2050
else
2051
bi_mov_i32_to(b, dst, s0);
2052
break;
2053
2054
case nir_op_u2u16:
2055
assert(src_sz == 8 || src_sz == 32);
2056
2057
if (src_sz == 8)
2058
bi_v2u8_to_v2u16_to(b, dst, s0);
2059
else
2060
bi_mov_i32_to(b, dst, s0);
2061
break;
2062
2063
case nir_op_b2f16:
2064
case nir_op_b2f32:
2065
bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
2066
(sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0),
2067
(sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0),
2068
BI_CMPF_NE);
2069
break;
2070
2071
case nir_op_b2b32:
2072
bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
2073
bi_imm_u32(~0), bi_zero(), BI_CMPF_NE);
2074
break;
2075
2076
case nir_op_b2i8:
2077
case nir_op_b2i16:
2078
case nir_op_b2i32:
2079
bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0));
2080
break;
2081
2082
case nir_op_fround_even:
2083
case nir_op_fceil:
2084
case nir_op_ffloor:
2085
case nir_op_ftrunc:
2086
bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2087
break;
2088
2089
case nir_op_fmin:
2090
bi_fmin_to(b, sz, dst, s0, s1);
2091
break;
2092
2093
case nir_op_fmax:
2094
bi_fmax_to(b, sz, dst, s0, s1);
2095
break;
2096
2097
case nir_op_iadd:
2098
bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2099
break;
2100
2101
case nir_op_iadd_sat:
2102
bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2103
break;
2104
2105
case nir_op_uadd_sat:
2106
bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2107
break;
2108
2109
case nir_op_ihadd:
2110
bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2111
break;
2112
2113
case nir_op_irhadd:
2114
bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2115
break;
2116
2117
case nir_op_ineg:
2118
bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2119
break;
2120
2121
case nir_op_isub:
2122
bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2123
break;
2124
2125
case nir_op_isub_sat:
2126
bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2127
break;
2128
2129
case nir_op_usub_sat:
2130
bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2131
break;
2132
2133
case nir_op_imul:
2134
bi_imul_to(b, sz, dst, s0, s1);
2135
break;
2136
2137
case nir_op_iabs:
2138
bi_iabs_to(b, sz, dst, s0);
2139
break;
2140
2141
case nir_op_iand:
2142
bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2143
break;
2144
2145
case nir_op_ior:
2146
bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2147
break;
2148
2149
case nir_op_ixor:
2150
bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2151
break;
2152
2153
case nir_op_inot:
2154
bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2155
break;
2156
2157
case nir_op_frsq:
2158
if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2159
bi_lower_frsq_32(b, dst, s0);
2160
else
2161
bi_frsq_to(b, sz, dst, s0);
2162
break;
2163
2164
case nir_op_frcp:
2165
if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2166
bi_lower_frcp_32(b, dst, s0);
2167
else
2168
bi_frcp_to(b, sz, dst, s0);
2169
break;
2170
2171
case nir_op_uclz:
2172
bi_clz_to(b, sz, dst, s0, false);
2173
break;
2174
2175
case nir_op_bit_count:
2176
bi_popcount_i32_to(b, dst, s0);
2177
break;
2178
2179
case nir_op_bitfield_reverse:
2180
bi_bitrev_i32_to(b, dst, s0);
2181
break;
2182
2183
case nir_op_ufind_msb: {
2184
bi_index clz = bi_clz(b, src_sz, s0, false);
2185
2186
if (sz == 8)
2187
clz = bi_byte(clz, 0);
2188
else if (sz == 16)
2189
clz = bi_half(clz, false);
2190
2191
bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
2192
break;
2193
}
2194
2195
default:
2196
fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2197
unreachable("Unknown ALU op");
2198
}
2199
}
2200
2201
/* Returns the texture dimension, with 0 special-casing cubemaps. Shamelessly copied from Midgard */
static unsigned
2203
bifrost_tex_format(enum glsl_sampler_dim dim)
2204
{
2205
switch (dim) {
2206
case GLSL_SAMPLER_DIM_1D:
2207
case GLSL_SAMPLER_DIM_BUF:
2208
return 1;
2209
2210
case GLSL_SAMPLER_DIM_2D:
2211
case GLSL_SAMPLER_DIM_MS:
2212
case GLSL_SAMPLER_DIM_EXTERNAL:
2213
case GLSL_SAMPLER_DIM_RECT:
2214
return 2;
2215
2216
case GLSL_SAMPLER_DIM_3D:
2217
return 3;
2218
2219
case GLSL_SAMPLER_DIM_CUBE:
2220
return 0;
2221
2222
default:
2223
DBG("Unknown sampler dim type\n");
2224
assert(0);
2225
return 0;
2226
}
2227
}
2228
2229
static enum bifrost_texture_format_full
2230
bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
2231
{
2232
switch (T) {
2233
case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
2234
case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
2235
case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16;
2236
case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16;
2237
case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32;
2238
case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32;
2239
default: unreachable("Invalid type for texturing");
2240
}
2241
}
2242
2243
/* Array indices are specified as 32-bit uints and arrive in the .z component from NIR; non-integer indices need converting */
static bi_index
2245
bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
2246
{
2247
/* For (u)int we can just pass the index through */
nir_alu_type base = nir_alu_type_get_base_type(T);
2249
if (base == nir_type_int || base == nir_type_uint)
2250
return idx;
2251
2252
/* Otherwise we convert */
2253
assert(T == nir_type_float32);
2254
2255
/* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
* Texel Selection") defines the layer to be taken from clamp(RNE(r),
* 0, dt - 1). So we use round RTE; clamping is handled at the data
* structure level */
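/* Illustrative example of the spec behaviour: a layer coordinate of 2.6
* rounds to 3, while 2.5 rounds to 2 under round-to-nearest-even; the
* clamp against the layer count then happens at the data structure level
* as noted above, not in shader code. */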
return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
2261
}
2262
2263
/* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
* 16-bit 8:8 fixed-point format. We lower as:
*
* F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
* MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
*/
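/* Worked example of the 8:8 encoding: an LOD of 2.5 is within the clamp
* range, scales to 2.5 * 256 = 640 = 0x0280, and is packed next to a zero
* half-word by MKVEC, giving 0x0280 in the low 16 bits of the staging word.
* An LOD of -1.0 likewise encodes as 0xFF00 (two's-complement -256). */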
static bi_index
2271
bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
2272
{
2273
/* Sort of arbitrary. Must be less than 128.0, greater than or equal to
2274
* the max LOD (16 since we cap at 2^16 texture dimensions), and
2275
* preferably small to minimize precision loss */
2276
const float max_lod = 16.0;
2277
2278
bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
2279
fp16 ? bi_half(lod, false) : lod,
2280
bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);
2281
2282
fsat->clamp = BI_CLAMP_CLAMP_M1_1;
2283
2284
bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
2285
bi_negzero(), BI_ROUND_NONE);
2286
2287
return bi_mkvec_v2i16(b,
2288
bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
2289
bi_imm_u16(0));
2290
}
2291
2292
/* FETCH takes a 32-bit staging register containing the LOD as an integer in
2293
* the bottom 16-bits and (if present) the cube face index in the top 16-bits.
2294
* TODO: Cube face.
2295
*/
2296
2297
static bi_index
2298
bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
2299
{
2300
return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
2301
}
2302
2303
/* The hardware specifies texel offsets and multisample indices together as a
* u8vec4 <offset, ms index>. By default all are zero, so if we have either a
* nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
* the bits we need and return that to be passed as a staging register. Else we
* return 0 to avoid allocating a data register when everything is zero. */
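/* For example, a textureOffset() with offset (1, -1) and sample index 2
* packs as the bytes <0x01, 0xFF, 0x00, 0x02>: the low three bytes carry
* the x/y/z offsets and the multisample index is OR'd into the top byte by
* the shift-by-24 below. */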
static bi_index
2310
bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
2311
{
2312
bi_index dest = bi_zero();
2313
2314
int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2315
if (offs_idx >= 0 &&
2316
(!nir_src_is_const(instr->src[offs_idx].src) ||
2317
nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
2318
unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
2319
bi_index idx = bi_src_index(&instr->src[offs_idx].src);
2320
dest = bi_mkvec_v4i8(b,
2321
(nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),
2322
(nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),
2323
(nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),
2324
bi_imm_u8(0));
2325
}
2326
2327
int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
2328
if (ms_idx >= 0 &&
2329
(!nir_src_is_const(instr->src[ms_idx].src) ||
2330
nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
2331
dest = bi_lshift_or_i32(b,
2332
bi_src_index(&instr->src[ms_idx].src), dest,
2333
bi_imm_u8(24));
2334
}
2335
2336
return dest;
2337
}
2338
2339
static void
2340
bi_emit_cube_coord(bi_builder *b, bi_index coord,
2341
bi_index *face, bi_index *s, bi_index *t)
2342
{
2343
/* Compute max { |x|, |y|, |z| } */
2344
bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader),
2345
bi_temp(b->shader), coord,
2346
bi_word(coord, 1), bi_word(coord, 2));
2347
2348
/* Select coordinates */
2349
2350
bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
2351
cubeface->dest[1]);
2352
2353
bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
2354
cubeface->dest[1]);
2355
2356
/* The OpenGL ES specification requires us to transform an input vector
* (x, y, z) to the coordinate, given the selected S/T:
*
* (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
*
* We implement (s shown, t similar) in a form friendlier to FMA
* instructions, and clamp coordinates at the end for correct
* NaN/infinity handling:
*
* fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
*
* Take the reciprocal of max{x, y, z}
*/
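/* As a worked example of the FMA form: for a selected coordinate s = -0.3
* and max{x, y, z} = 0.6, the reciprocal is 1/0.6, half of that is about
* 0.8333, and fsat(-0.3 * 0.8333 + 0.5) = 0.25, matching the reference
* formula (1/2)((-0.3 / 0.6) + 1) = 0.25. */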
bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);
2371
2372
/* Calculate 0.5 * (1.0 / max{x, y, z}) */
2373
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
2374
BI_ROUND_NONE);
2375
2376
/* Transform the coordinates */
2377
*s = bi_temp(b->shader);
2378
*t = bi_temp(b->shader);
2379
2380
bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
2381
BI_ROUND_NONE);
2382
bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
2383
BI_ROUND_NONE);
2384
2385
S->clamp = BI_CLAMP_CLAMP_0_1;
2386
T->clamp = BI_CLAMP_CLAMP_0_1;
2387
2388
/* Face index at bit[29:31], matching the cube map descriptor */
2389
*face = cubeface->dest[1];
2390
}
2391
2392
/* Emits a cube map descriptor, returning the lower 32 bits and writing the
* upper 32 bits through the passed pointer t. The packing of the face with
* the S coordinate exploits the redundancy of the floating-point encoding
* together with the range restriction of the CUBEFACE output.
*
* struct cube_map_descriptor {
* float s : 29;
* unsigned face : 3;
* float t : 32;
* }
*
* Since the cube face index is preshifted, this is easy to pack with a bitwise
* MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
* bits from face.
*/
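/* Concretely, BITFIELD_MASK(29) = 0x1fffffff below makes the bitwise MUX
* keep bits [28:0] of the S coordinate and bits [31:29] of the preshifted
* face index, so a face index of 5 contributes 0xa0000000 to the low
* descriptor word. */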
static bi_index
2409
bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
2410
{
2411
bi_index face, s;
2412
bi_emit_cube_coord(b, coord, &face, &s, t);
2413
bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
2414
return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
2415
}
2416
2417
/* Map to the main texture op used. Some of these (txd in particular) will
2418
* lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
2419
* sequence). We assume that lowering is handled elsewhere.
2420
*/
2421
2422
static enum bifrost_tex_op
2423
bi_tex_op(nir_texop op)
2424
{
2425
switch (op) {
2426
case nir_texop_tex:
2427
case nir_texop_txb:
2428
case nir_texop_txl:
2429
case nir_texop_txd:
2430
case nir_texop_tex_prefetch:
2431
return BIFROST_TEX_OP_TEX;
2432
case nir_texop_txf:
2433
case nir_texop_txf_ms:
2434
case nir_texop_txf_ms_fb:
2435
case nir_texop_txf_ms_mcs:
2436
case nir_texop_tg4:
2437
return BIFROST_TEX_OP_FETCH;
2438
case nir_texop_txs:
2439
case nir_texop_lod:
2440
case nir_texop_query_levels:
2441
case nir_texop_texture_samples:
2442
case nir_texop_samples_identical:
2443
unreachable("should've been lowered");
2444
default:
2445
unreachable("unsupported tex op");
2446
}
2447
}
2448
2449
/* Data registers required by texturing in the order they appear. All are
* optional; the texture operation descriptor determines which are present.
* Note that since 3D arrays are not permitted at an API level, Z_COORD and
* ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */

enum bifrost_tex_dreg {
BIFROST_TEX_DREG_Z_COORD = 0,
BIFROST_TEX_DREG_Y_DELTAS = 1,
BIFROST_TEX_DREG_LOD = 2,
BIFROST_TEX_DREG_GRDESC_HI = 3,
BIFROST_TEX_DREG_SHADOW = 4,
BIFROST_TEX_DREG_ARRAY = 5,
BIFROST_TEX_DREG_OFFSETMS = 6,
BIFROST_TEX_DREG_SAMPLER = 7,
BIFROST_TEX_DREG_TEXTURE = 8,
BIFROST_TEX_DREG_COUNT,
};
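/* For example, a shadow lookup from a 2D array with an explicit LOD
* populates LOD, SHADOW and ARRAY; the compaction loop in bi_emit_texc then
* packs exactly those three into consecutive staging registers, preserving
* this enum order. */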
static void
2468
bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
2469
{
2470
bool computed_lod = false;
2471
2472
struct bifrost_texture_operation desc = {
2473
.op = bi_tex_op(instr->op),
2474
.offset_or_bias_disable = false, /* TODO */
2475
.shadow_or_clamp_disable = instr->is_shadow,
2476
.array = instr->is_array,
2477
.dimension = bifrost_tex_format(instr->sampler_dim),
2478
.format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
2479
.mask = 0xF,
2480
};
2481
2482
switch (desc.op) {
2483
case BIFROST_TEX_OP_TEX:
2484
desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
2485
computed_lod = true;
2486
break;
2487
case BIFROST_TEX_OP_FETCH:
2488
desc.lod_or_fetch = instr->op == nir_texop_tg4 ?
2489
BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
2490
BIFROST_TEXTURE_FETCH_TEXEL;
2491
break;
2492
default:
2493
unreachable("texture op unsupported");
2494
}
2495
2496
/* 32-bit indices to be allocated as consecutive staging registers */
2497
bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
2498
bi_index cx = bi_null(), cy = bi_null();
2499
2500
for (unsigned i = 0; i < instr->num_srcs; ++i) {
2501
bi_index index = bi_src_index(&instr->src[i].src);
2502
unsigned sz = nir_src_bit_size(instr->src[i].src);
2503
ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
2504
nir_alu_type T = base | sz;
2505
2506
switch (instr->src[i].src_type) {
2507
case nir_tex_src_coord:
2508
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2509
cx = bi_emit_texc_cube_coord(b, index, &cy);
2510
} else {
2511
unsigned components = nir_src_num_components(instr->src[i].src);
2512
2513
/* Copy XY (for 2D+) or XX (for 1D) */
2514
cx = index;
2515
cy = bi_word(index, MIN2(1, components - 1));
2516
2517
assert(components >= 1 && components <= 3);
2518
2519
if (components < 3) {
2520
/* nothing to do */
2521
} else if (desc.array) {
2522
/* 2D array */
2523
dregs[BIFROST_TEX_DREG_ARRAY] =
2524
bi_emit_texc_array_index(b,
2525
bi_word(index, 2), T);
2526
} else {
2527
/* 3D */
2528
dregs[BIFROST_TEX_DREG_Z_COORD] =
2529
bi_word(index, 2);
2530
}
2531
}
2532
break;
2533
2534
case nir_tex_src_lod:
2535
if (desc.op == BIFROST_TEX_OP_TEX &&
2536
nir_src_is_const(instr->src[i].src) &&
2537
nir_src_as_uint(instr->src[i].src) == 0) {
2538
desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
2539
} else if (desc.op == BIFROST_TEX_OP_TEX) {
2540
assert(base == nir_type_float);
2541
2542
assert(sz == 16 || sz == 32);
2543
dregs[BIFROST_TEX_DREG_LOD] =
2544
bi_emit_texc_lod_88(b, index, sz == 16);
2545
desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
2546
} else {
2547
assert(desc.op == BIFROST_TEX_OP_FETCH);
2548
assert(base == nir_type_uint || base == nir_type_int);
2549
assert(sz == 16 || sz == 32);
2550
2551
dregs[BIFROST_TEX_DREG_LOD] =
2552
bi_emit_texc_lod_cube(b, index);
2553
}
2554
2555
break;
2556
2557
case nir_tex_src_bias:
2558
/* Upper 16-bits interpreted as a clamp, leave zero */
2559
assert(desc.op == BIFROST_TEX_OP_TEX);
2560
assert(base == nir_type_float);
2561
assert(sz == 16 || sz == 32);
2562
dregs[BIFROST_TEX_DREG_LOD] =
2563
bi_emit_texc_lod_88(b, index, sz == 16);
2564
desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
2565
computed_lod = true;
2566
break;
2567
2568
case nir_tex_src_ms_index:
2569
case nir_tex_src_offset:
2570
if (desc.offset_or_bias_disable)
2571
break;
2572
2573
dregs[BIFROST_TEX_DREG_OFFSETMS] =
2574
bi_emit_texc_offset_ms_index(b, instr);
2575
if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
2576
desc.offset_or_bias_disable = true;
2577
break;
2578
2579
case nir_tex_src_comparator:
2580
dregs[BIFROST_TEX_DREG_SHADOW] = index;
2581
break;
2582
2583
case nir_tex_src_texture_offset:
2584
assert(instr->texture_index == 0);
2585
dregs[BIFROST_TEX_DREG_TEXTURE] = index;
2586
break;
2587
2588
case nir_tex_src_sampler_offset:
2589
assert(instr->sampler_index == 0);
2590
dregs[BIFROST_TEX_DREG_SAMPLER] = index;
2591
break;
2592
2593
default:
2594
unreachable("Unhandled src type in texc emit");
2595
}
2596
}
2597
2598
if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
2599
dregs[BIFROST_TEX_DREG_LOD] =
2600
bi_emit_texc_lod_cube(b, bi_zero());
2601
}
2602
2603
/* Choose an index mode */
2604
2605
bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
2606
bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
2607
bool direct = direct_tex && direct_samp;
2608
2609
desc.immediate_indices = direct && (instr->sampler_index < 16);
2610
2611
if (desc.immediate_indices) {
2612
desc.sampler_index_or_mode = instr->sampler_index;
2613
desc.index = instr->texture_index;
2614
} else {
2615
enum bifrost_index mode = 0;
2616
2617
if (direct && instr->sampler_index == instr->texture_index) {
2618
mode = BIFROST_INDEX_IMMEDIATE_SHARED;
2619
desc.index = instr->texture_index;
2620
} else if (direct) {
2621
mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2622
desc.index = instr->sampler_index;
2623
dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
2624
bi_imm_u32(instr->texture_index));
2625
} else if (direct_tex) {
2626
assert(!direct_samp);
2627
mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
2628
desc.index = instr->texture_index;
2629
} else if (direct_samp) {
2630
assert(!direct_tex);
2631
mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2632
desc.index = instr->sampler_index;
2633
} else {
2634
mode = BIFROST_INDEX_REGISTER;
2635
}
2636
2637
desc.sampler_index_or_mode = mode | (0x3 << 2);
2638
}
2639
2640
/* Allocate staging registers contiguously by compacting the array.
2641
* Index is not SSA (tied operands) */
2642
2643
unsigned sr_count = 0;
2644
2645
for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
2646
if (!bi_is_null(dregs[i]))
2647
dregs[sr_count++] = dregs[i];
2648
}
2649
2650
bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null();
2651
2652
if (sr_count)
2653
bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32);
2654
2655
uint32_t desc_u = 0;
2656
memcpy(&desc_u, &desc, sizeof(desc_u));
2657
bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest),
2658
idx, cx, cy, bi_imm_u32(desc_u), !computed_lod,
2659
sr_count);
2660
2661
/* Explicit copy to facilitate tied operands */
2662
if (sr_count) {
2663
bi_index srcs[4] = { idx, idx, idx, idx };
2664
unsigned channels[4] = { 0, 1, 2, 3 };
2665
bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32);
2666
}
2667
}
2668
2669
/* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
* textures with sufficiently small immediate indices. Anything else
* needs a complete texture op. */
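/* E.g. a plain texture2D() lookup with texture and sampler indices below 8
* takes the single-instruction TEXS path, whereas a shadow comparison, an
* array layer or a non-constant/nonzero LOD falls back to the full TEXC
* path emitted by bi_emit_texc. */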
static void
2674
bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
2675
{
2676
int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
2677
assert(coord_idx >= 0);
2678
bi_index coords = bi_src_index(&instr->src[coord_idx].src);
2679
2680
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2681
bi_index face, s, t;
2682
bi_emit_cube_coord(b, coords, &face, &s, &t);
2683
2684
bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
2685
bi_dest_index(&instr->dest),
2686
s, t, face,
2687
instr->sampler_index, instr->texture_index);
2688
} else {
2689
bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
2690
bi_dest_index(&instr->dest),
2691
coords, bi_word(coords, 1),
2692
instr->op != nir_texop_tex, /* zero LOD */
2693
instr->sampler_index, instr->texture_index);
2694
}
2695
}
2696
2697
static bool
2698
bi_is_simple_tex(nir_tex_instr *instr)
2699
{
2700
if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
2701
return false;
2702
2703
if (instr->dest_type != nir_type_float32 &&
2704
instr->dest_type != nir_type_float16)
2705
return false;
2706
2707
if (instr->is_shadow || instr->is_array)
2708
return false;
2709
2710
switch (instr->sampler_dim) {
2711
case GLSL_SAMPLER_DIM_2D:
2712
case GLSL_SAMPLER_DIM_EXTERNAL:
2713
case GLSL_SAMPLER_DIM_RECT:
2714
break;
2715
2716
case GLSL_SAMPLER_DIM_CUBE:
2717
/* LOD can't be specified with TEXS_CUBE */
2718
if (instr->op == nir_texop_txl)
2719
return false;
2720
break;
2721
2722
default:
2723
return false;
2724
}
2725
2726
for (unsigned i = 0; i < instr->num_srcs; ++i) {
2727
if (instr->src[i].src_type != nir_tex_src_lod &&
2728
instr->src[i].src_type != nir_tex_src_coord)
2729
return false;
2730
}
2731
2732
/* Indices need to fit in provided bits */
2733
unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
2734
if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
2735
return false;
2736
2737
int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2738
if (lod_idx < 0)
2739
return true;
2740
2741
nir_src lod = instr->src[lod_idx].src;
2742
return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
2743
}
2744
2745
static void
2746
bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
2747
{
2748
switch (instr->op) {
2749
case nir_texop_txs:
2750
bi_load_sysval_to(b, bi_dest_index(&instr->dest),
2751
panfrost_sysval_for_instr(&instr->instr, NULL),
2752
4, 0);
2753
return;
2754
case nir_texop_tex:
2755
case nir_texop_txl:
2756
case nir_texop_txb:
2757
case nir_texop_txf:
2758
case nir_texop_txf_ms:
2759
case nir_texop_tg4:
2760
break;
2761
default:
2762
unreachable("Invalid texture operation");
2763
}
2764
2765
if (bi_is_simple_tex(instr))
2766
bi_emit_texs(b, instr);
2767
else
2768
bi_emit_texc(b, instr);
2769
}
2770
2771
static void
2772
bi_emit_instr(bi_builder *b, struct nir_instr *instr)
2773
{
2774
switch (instr->type) {
2775
case nir_instr_type_load_const:
2776
bi_emit_load_const(b, nir_instr_as_load_const(instr));
2777
break;
2778
2779
case nir_instr_type_intrinsic:
2780
bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2781
break;
2782
2783
case nir_instr_type_alu:
2784
bi_emit_alu(b, nir_instr_as_alu(instr));
2785
break;
2786
2787
case nir_instr_type_tex:
2788
bi_emit_tex(b, nir_instr_as_tex(instr));
2789
break;
2790
2791
case nir_instr_type_jump:
2792
bi_emit_jump(b, nir_instr_as_jump(instr));
2793
break;
2794
2795
default:
2796
unreachable("should've been lowered");
2797
}
2798
}
2799
2800
static bi_block *
2801
create_empty_block(bi_context *ctx)
2802
{
2803
bi_block *blk = rzalloc(ctx, bi_block);
2804
2805
blk->base.predecessors = _mesa_set_create(blk,
2806
_mesa_hash_pointer,
2807
_mesa_key_pointer_equal);
2808
2809
return blk;
2810
}
2811
2812
static bi_block *
2813
emit_block(bi_context *ctx, nir_block *block)
2814
{
2815
if (ctx->after_block) {
2816
ctx->current_block = ctx->after_block;
2817
ctx->after_block = NULL;
2818
} else {
2819
ctx->current_block = create_empty_block(ctx);
2820
}
2821
2822
list_addtail(&ctx->current_block->base.link, &ctx->blocks);
2823
list_inithead(&ctx->current_block->base.instructions);
2824
2825
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
2826
2827
nir_foreach_instr(instr, block) {
2828
bi_emit_instr(&_b, instr);
2829
++ctx->instruction_count;
2830
}
2831
2832
return ctx->current_block;
2833
}
2834
2835
static void
2836
emit_if(bi_context *ctx, nir_if *nif)
2837
{
2838
bi_block *before_block = ctx->current_block;
2839
2840
/* Speculatively emit the branch, but we can't fill it in until later */
2841
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
2842
bi_instr *then_branch = bi_branchz_i16(&_b,
2843
bi_half(bi_src_index(&nif->condition), false),
2844
bi_zero(), BI_CMPF_EQ);
2845
2846
/* Emit the two subblocks. */
2847
bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
2848
bi_block *end_then_block = ctx->current_block;
2849
2850
/* Emit second block, and check if it's empty */
2851
2852
int count_in = ctx->instruction_count;
2853
bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
2854
bi_block *end_else_block = ctx->current_block;
2855
ctx->after_block = create_empty_block(ctx);
2856
2857
/* Now that we have the subblocks emitted, fix up the branches */
2858
2859
assert(then_block);
2860
assert(else_block);
2861
2862
if (ctx->instruction_count == count_in) {
2863
then_branch->branch_target = ctx->after_block;
2864
pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); /* fallthrough */
2865
} else {
2866
then_branch->branch_target = else_block;
2867
2868
/* Emit a jump from the end of the then block to the end of the else */
2869
_b.cursor = bi_after_block(end_then_block);
2870
bi_instr *then_exit = bi_jump(&_b, bi_zero());
2871
then_exit->branch_target = ctx->after_block;
2872
2873
pan_block_add_successor(&end_then_block->base, &then_exit->branch_target->base);
2874
pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); /* fallthrough */
2875
}
2876
2877
pan_block_add_successor(&before_block->base, &then_branch->branch_target->base); /* then_branch */
2878
pan_block_add_successor(&before_block->base, &then_block->base); /* fallthrough */
2879
}
2880
2881
static void
2882
emit_loop(bi_context *ctx, nir_loop *nloop)
2883
{
2884
/* Remember where we are */
2885
bi_block *start_block = ctx->current_block;
2886
2887
bi_block *saved_break = ctx->break_block;
2888
bi_block *saved_continue = ctx->continue_block;
2889
2890
ctx->continue_block = create_empty_block(ctx);
2891
ctx->break_block = create_empty_block(ctx);
2892
ctx->after_block = ctx->continue_block;
2893
2894
/* Emit the body itself */
2895
emit_cf_list(ctx, &nloop->body);
2896
2897
/* Branch back to the top of the loop */
bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
2899
bi_instr *I = bi_jump(&_b, bi_zero());
2900
I->branch_target = ctx->continue_block;
2901
pan_block_add_successor(&start_block->base, &ctx->continue_block->base);
2902
pan_block_add_successor(&ctx->current_block->base, &ctx->continue_block->base);
2903
2904
ctx->after_block = ctx->break_block;
2905
2906
/* Pop off */
2907
ctx->break_block = saved_break;
2908
ctx->continue_block = saved_continue;
2909
++ctx->loop_count;
2910
}
2911
2912
static bi_block *
2913
emit_cf_list(bi_context *ctx, struct exec_list *list)
2914
{
2915
bi_block *start_block = NULL;
2916
2917
foreach_list_typed(nir_cf_node, node, node, list) {
2918
switch (node->type) {
2919
case nir_cf_node_block: {
2920
bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2921
2922
if (!start_block)
2923
start_block = block;
2924
2925
break;
2926
}
2927
2928
case nir_cf_node_if:
2929
emit_if(ctx, nir_cf_node_as_if(node));
2930
break;
2931
2932
case nir_cf_node_loop:
2933
emit_loop(ctx, nir_cf_node_as_loop(node));
2934
break;
2935
2936
default:
2937
unreachable("Unknown control flow");
2938
}
2939
}
2940
2941
return start_block;
2942
}
2943
2944
/* shader-db stuff */
2945
2946
struct bi_stats {
2947
unsigned nr_clauses, nr_tuples, nr_ins;
2948
unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
2949
};
2950
2951
static void
2952
bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
2953
{
2954
/* Count instructions */
2955
stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
2956
2957
/* Non-message passing tuples are always arithmetic */
2958
if (tuple->add != clause->message) {
2959
stats->nr_arith++;
2960
return;
2961
}
2962
2963
/* Message + FMA we'll count as arithmetic _and_ message */
2964
if (tuple->fma)
2965
stats->nr_arith++;
2966
2967
switch (clause->message_type) {
2968
case BIFROST_MESSAGE_VARYING:
2969
/* Check components interpolated */
2970
stats->nr_varying += (clause->message->vecsize + 1) *
2971
(bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
2972
break;
2973
2974
case BIFROST_MESSAGE_VARTEX:
2975
/* 2 coordinates, fp32 each */
2976
stats->nr_varying += (2 * 2);
2977
FALLTHROUGH;
2978
case BIFROST_MESSAGE_TEX:
2979
stats->nr_texture++;
2980
break;
2981
2982
case BIFROST_MESSAGE_ATTRIBUTE:
2983
case BIFROST_MESSAGE_LOAD:
2984
case BIFROST_MESSAGE_STORE:
2985
case BIFROST_MESSAGE_ATOMIC:
2986
stats->nr_ldst++;
2987
break;
2988
2989
case BIFROST_MESSAGE_NONE:
2990
case BIFROST_MESSAGE_BARRIER:
2991
case BIFROST_MESSAGE_BLEND:
2992
case BIFROST_MESSAGE_TILE:
2993
case BIFROST_MESSAGE_Z_STENCIL:
2994
case BIFROST_MESSAGE_ATEST:
2995
case BIFROST_MESSAGE_JOB:
2996
case BIFROST_MESSAGE_64BIT:
2997
/* Nothing to do */
2998
break;
2999
};
3000
3001
}
3002
3003
static void
3004
bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
3005
{
3006
struct bi_stats stats = { 0 };
3007
3008
/* Count instructions, clauses, and tuples. Also attempt to construct
* normalized execution engine cycle counts, using the following ratios:
*
* 24 arith tuples/cycle
* 2 texture messages/cycle
* 16 x 16-bit varying channels interpolated/cycle
* 1 load store message/cycle
*
* These numbers seem to match Arm Mobile Studio's heuristic. The real
* cycle counts are surely more complicated.
*/
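/* Under these ratios, a shader with 48 arithmetic tuples, 2 texture
* messages, 8 fp32 varying components (16 half-word channels) and 3
* load/store messages scores 2.0, 1.0, 1.0 and 3.0 cycles respectively, so
* the reported bound is the 3.0 cycles spent on the load/store unit. */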
bi_foreach_block(ctx, _block) {
3021
bi_block *block = (bi_block *) _block;
3022
3023
bi_foreach_clause_in_block(block, clause) {
3024
stats.nr_clauses++;
3025
stats.nr_tuples += clause->tuple_count;
3026
3027
for (unsigned i = 0; i < clause->tuple_count; ++i)
3028
bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
3029
}
3030
}
3031
3032
float cycles_arith = ((float) stats.nr_arith) / 24.0;
3033
float cycles_texture = ((float) stats.nr_texture) / 2.0;
3034
float cycles_varying = ((float) stats.nr_varying) / 16.0;
3035
float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
3036
3037
float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
3038
float cycles_bound = MAX2(cycles_arith, cycles_message);
3039
3040
/* Thread count and register pressure are traded off only on v7 */
3041
bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);
3042
unsigned nr_threads = full_threads ? 2 : 1;
3043
3044
/* Dump stats */
3045
3046
fprintf(stderr, "%s - %s shader: "
3047
"%u inst, %u tuples, %u clauses, "
3048
"%f cycles, %f arith, %f texture, %f vary, %f ldst, "
3049
"%u quadwords, %u threads, %u loops, "
3050
"%u:%u spills:fills\n",
3051
ctx->nir->info.label ?: "",
3052
ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
3053
gl_shader_stage_name(ctx->stage),
3054
stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
3055
cycles_bound, cycles_arith, cycles_texture,
3056
cycles_varying, cycles_ldst,
3057
size / 16, nr_threads,
3058
ctx->loop_count,
3059
ctx->spills, ctx->fills);
3060
}
3061
3062
static int
3063
glsl_type_size(const struct glsl_type *type, bool bindless)
3064
{
3065
return glsl_count_attribute_slots(type, false);
3066
}
3067
3068
/* Split stores to memory. We don't split stores to vertex outputs, since
* nir_lower_io_to_temporaries will ensure there's only a single write.
*/
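/* For instance, a store_ssbo with write mask 0b0101 (.x and .z) is split by
* nir_lower_wrmasks into separate single-component stores with adjusted
* offsets, which map more naturally onto the backend's store messages. */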
static bool
3073
should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
3074
{
3075
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3076
3077
switch (intr->intrinsic) {
3078
case nir_intrinsic_store_ssbo:
3079
case nir_intrinsic_store_shared:
3080
case nir_intrinsic_store_global:
3081
case nir_intrinsic_store_scratch:
3082
return true;
3083
default:
3084
return false;
3085
}
3086
}
3087
3088
/* Bifrost wants transcendentals as FP32 */
3089
3090
static unsigned
3091
bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
3092
{
3093
if (instr->type != nir_instr_type_alu)
3094
return 0;
3095
3096
nir_alu_instr *alu = nir_instr_as_alu(instr);
3097
3098
switch (alu->op) {
3099
case nir_op_fexp2:
3100
case nir_op_flog2:
3101
case nir_op_fpow:
3102
case nir_op_fsin:
3103
case nir_op_fcos:
3104
return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
3105
default:
3106
return 0;
3107
}
3108
}
3109
3110
/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
* transcendentals are an exception. Shifts are also excluded because of a
* lane size mismatch (8-bit in Bifrost, 32-bit in NIR, TODO: workaround!).
* Some conversions need to be scalarized due to type size. */
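/* E.g. a v2f16 fadd stays vectorized, while frcp, the shifts and the
* 16-bit conversions listed below return false here and are left scalar. */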
static bool
3116
bi_vectorize_filter(const nir_instr *instr, void *data)
3117
{
3118
/* Defaults work for everything else */
3119
if (instr->type != nir_instr_type_alu)
3120
return true;
3121
3122
const nir_alu_instr *alu = nir_instr_as_alu(instr);
3123
3124
switch (alu->op) {
3125
case nir_op_frcp:
3126
case nir_op_frsq:
3127
case nir_op_ishl:
3128
case nir_op_ishr:
3129
case nir_op_ushr:
3130
case nir_op_f2i16:
3131
case nir_op_f2u16:
3132
case nir_op_i2f16:
3133
case nir_op_u2f16:
3134
return false;
3135
default:
3136
return true;
3137
}
3138
}
3139
3140
/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we
3141
* keep divergence info around after we consume it for indirect lowering,
3142
* nir_convert_from_ssa will regress code quality since it will avoid
3143
* coalescing divergent with non-divergent nodes. */
3144
3145
static bool
3146
nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
3147
{
3148
ssa->divergent = false;
3149
return true;
3150
}
3151
3152
static bool
3153
nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
3154
UNUSED void *data)
3155
{
3156
return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
3157
}
3158
3159
static void
3160
bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
3161
{
3162
bool progress;
3163
unsigned lower_flrp = 16 | 32 | 64;
3164
3165
NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
3166
3167
nir_lower_tex_options lower_tex_options = {
3168
.lower_txs_lod = true,
3169
.lower_txp = ~0,
3170
.lower_tg4_broadcom_swizzle = true,
3171
.lower_txd = true,
3172
};
3173
3174
NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
3175
NIR_PASS(progress, nir, pan_lower_helper_invocation);
3176
3177
NIR_PASS(progress, nir, nir_lower_int64);
3178
3179
nir_lower_idiv_options idiv_options = {
3180
.imprecise_32bit_lowering = true,
3181
.allow_fp16 = true,
3182
};
3183
NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
3184
3185
NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
3186
NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
3187
NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
3188
3189
do {
3190
progress = false;
3191
3192
NIR_PASS(progress, nir, nir_lower_var_copies);
3193
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
3194
NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
3195
3196
NIR_PASS(progress, nir, nir_copy_prop);
3197
NIR_PASS(progress, nir, nir_opt_remove_phis);
3198
NIR_PASS(progress, nir, nir_opt_dce);
3199
NIR_PASS(progress, nir, nir_opt_dead_cf);
3200
NIR_PASS(progress, nir, nir_opt_cse);
3201
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
3202
NIR_PASS(progress, nir, nir_opt_algebraic);
3203
NIR_PASS(progress, nir, nir_opt_constant_folding);
3204
3205
NIR_PASS(progress, nir, nir_lower_alu);
3206
3207
if (lower_flrp != 0) {
3208
bool lower_flrp_progress = false;
3209
NIR_PASS(lower_flrp_progress,
3210
nir,
3211
nir_lower_flrp,
3212
lower_flrp,
3213
false /* always_precise */);
3214
if (lower_flrp_progress) {
3215
NIR_PASS(progress, nir,
3216
nir_opt_constant_folding);
3217
progress = true;
3218
}
3219
3220
/* Nothing should rematerialize any flrps, so we only
3221
* need to do this lowering once.
3222
*/
3223
lower_flrp = 0;
3224
}
3225
3226
NIR_PASS(progress, nir, nir_opt_undef);
3227
NIR_PASS(progress, nir, nir_lower_undef_to_zero);
3228
3229
NIR_PASS(progress, nir, nir_opt_loop_unroll,
3230
nir_var_shader_in |
3231
nir_var_shader_out |
3232
nir_var_function_temp);
3233
} while (progress);
3234
3235
/* TODO: Why is 64-bit getting rematerialized?
3236
* KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
3237
NIR_PASS(progress, nir, nir_lower_int64);
3238
3239
/* We need to clean up after each iteration of late algebraic
* optimizations, since otherwise NIR can produce weird edge cases
* (like fneg of a constant) which we don't handle */
bool late_algebraic = true;
3243
while (late_algebraic) {
3244
late_algebraic = false;
3245
NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
3246
NIR_PASS(progress, nir, nir_opt_constant_folding);
3247
NIR_PASS(progress, nir, nir_copy_prop);
3248
NIR_PASS(progress, nir, nir_opt_dce);
3249
NIR_PASS(progress, nir, nir_opt_cse);
3250
}
3251
3252
NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
3253
NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
3254
NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
3255
NIR_PASS(progress, nir, nir_opt_dce);
3256
3257
/* Prepass to simplify instruction selection */
3258
NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
3259
3260
/* Backend scheduler is purely local, so do some global optimizations
3261
* to reduce register pressure. */
3262
nir_move_options move_all =
3263
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3264
nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3265
3266
NIR_PASS_V(nir, nir_opt_sink, move_all);
3267
NIR_PASS_V(nir, nir_opt_move, move_all);
3268
3269
/* We might lower attribute, varying, and image indirects. Use the
3270
* gathered info to skip the extra analysis in the happy path. */
3271
bool any_indirects =
3272
nir->info.inputs_read_indirectly ||
3273
nir->info.outputs_accessed_indirectly ||
3274
nir->info.patch_inputs_read_indirectly ||
3275
nir->info.patch_outputs_accessed_indirectly ||
3276
nir->info.images_used;
3277
3278
if (any_indirects) {
3279
nir_convert_to_lcssa(nir, true, true);
3280
NIR_PASS_V(nir, nir_divergence_analysis);
3281
NIR_PASS_V(nir, bi_lower_divergent_indirects,
3282
bifrost_lanes_per_warp(gpu_id));
3283
NIR_PASS_V(nir, nir_shader_instructions_pass,
3284
nir_invalidate_divergence, nir_metadata_all, NULL);
3285
}
3286
3287
/* Take us out of SSA */
3288
NIR_PASS(progress, nir, nir_lower_locals_to_regs);
3289
NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
3290
NIR_PASS(progress, nir, nir_convert_from_ssa, true);
3291
}
3292
3293
/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
3294
* same lowering here to zero-extend correctly */
3295
3296
static bool
3297
bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
3298
nir_intrinsic_instr *intr, UNUSED void *data)
3299
{
3300
if (nir_src_bit_size(intr->src[0]) != 8)
3301
return false;
3302
3303
nir_alu_type type =
3304
nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));
3305
3306
assert(type == nir_type_int || type == nir_type_uint);
3307
3308
b->cursor = nir_before_instr(&intr->instr);
3309
nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);
3310
3311
nir_intrinsic_set_src_type(intr, type | 16);
3312
nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
3313
return true;
3314
}
3315
3316
static bool
3317
bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
3318
nir_intrinsic_instr *intr, UNUSED void *data)
3319
{
3320
if (nir_dest_bit_size(intr->dest) != 8)
3321
return false;
3322
3323
nir_alu_type type =
3324
nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));
3325
3326
assert(type == nir_type_int || type == nir_type_uint);
3327
3328
b->cursor = nir_before_instr(&intr->instr);
3329
nir_ssa_def *out =
3330
nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
3331
.base = nir_intrinsic_base(intr),
3332
.component = nir_intrinsic_component(intr),
3333
.dest_type = type | 16,
3334
.io_semantics = nir_intrinsic_io_semantics(intr));
3335
3336
nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
3337
nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
3338
return true;
3339
}
3340
3341
static bool
3342
bifrost_nir_lower_i8_frag(struct nir_builder *b,
3343
nir_instr *instr, UNUSED void *data)
3344
{
3345
if (instr->type != nir_instr_type_intrinsic)
3346
return false;
3347
3348
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3349
if (intr->intrinsic == nir_intrinsic_load_output)
3350
return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
3351
else if (intr->intrinsic == nir_intrinsic_store_output)
3352
return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
3353
else
3354
return false;
3355
}
3356
3357
static void
3358
bi_opt_post_ra(bi_context *ctx)
3359
{
3360
bi_foreach_instr_global_safe(ctx, ins) {
3361
if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
3362
bi_remove_instruction(ins);
3363
}
3364
}
3365
3366
static bool
3367
bifrost_nir_lower_store_component(struct nir_builder *b,
3368
nir_instr *instr, void *data)
3369
{
3370
if (instr->type != nir_instr_type_intrinsic)
3371
return false;
3372
3373
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3374
3375
if (intr->intrinsic != nir_intrinsic_store_output)
3376
return false;
3377
3378
struct hash_table_u64 *slots = data;
3379
unsigned component = nir_intrinsic_component(intr);
3380
nir_src *slot_src = nir_get_io_offset_src(intr);
3381
uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
3382
3383
nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
3384
unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
3385
3386
nir_ssa_def *value = intr->src[0].ssa;
3387
b->cursor = nir_before_instr(&intr->instr);
3388
3389
nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
3390
nir_ssa_def *channels[4] = { undef, undef, undef, undef };
3391
3392
/* Copy old */
3393
u_foreach_bit(i, mask) {
3394
assert(prev != NULL);
3395
nir_ssa_def *prev_ssa = prev->src[0].ssa;
3396
channels[i] = nir_channel(b, prev_ssa, i);
3397
}
3398
3399
/* Copy new */
3400
unsigned new_mask = nir_intrinsic_write_mask(intr);
3401
mask |= (new_mask << component);
3402
3403
u_foreach_bit(i, new_mask) {
3404
assert(component + i < 4);
3405
channels[component + i] = nir_channel(b, value, i);
3406
}
3407
3408
intr->num_components = util_last_bit(mask);
3409
nir_instr_rewrite_src_ssa(instr, &intr->src[0],
3410
nir_vec(b, channels, intr->num_components));
3411
3412
nir_intrinsic_set_component(intr, 0);
3413
nir_intrinsic_set_write_mask(intr, mask);
3414
3415
if (prev) {
3416
_mesa_hash_table_u64_remove(slots, slot);
3417
nir_instr_remove(&prev->instr);
3418
}
3419
3420
_mesa_hash_table_u64_insert(slots, slot, intr);
3421
return false;
3422
}
3423
3424
/* Dead code elimination for branches at the end of a block - only one branch
* per block is legal semantically, but unreachable jumps can be generated.
* Likewise we can generate jumps to the terminal block which need to be
* lowered away to a jump to #0x0, which induces successful termination. */
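/* For example, if a block ends in "JUMP -> else_block; JUMP -> after_block",
* only the first branch is reachable and the trailing jump is deleted. A
* surviving branch whose target is the terminal block gets branch_target
* cleared to NULL so it packs as the #0x0 jump described above. */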
static void
3430
bi_lower_branch(bi_block *block)
3431
{
3432
bool branched = false;
3433
ASSERTED bool was_jump = false;
3434
3435
bi_foreach_instr_in_block_safe(block, ins) {
3436
if (!ins->branch_target) continue;
3437
3438
if (branched) {
3439
assert(was_jump && (ins->op == BI_OPCODE_JUMP));
3440
bi_remove_instruction(ins);
3441
continue;
3442
}
3443
3444
branched = true;
3445
was_jump = ins->op == BI_OPCODE_JUMP;
3446
3447
if (bi_is_terminal_block(ins->branch_target))
3448
ins->branch_target = NULL;
3449
}
3450
}
3451
3452
void
3453
bifrost_compile_shader_nir(nir_shader *nir,
3454
const struct panfrost_compile_inputs *inputs,
3455
struct util_dynarray *binary,
3456
struct pan_shader_info *info)
3457
{
3458
bifrost_debug = debug_get_option_bifrost_debug();
3459
3460
bi_context *ctx = rzalloc(NULL, bi_context);
3461
ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
3462
3463
ctx->inputs = inputs;
3464
ctx->nir = nir;
3465
ctx->info = info;
3466
ctx->stage = nir->info.stage;
3467
ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
3468
ctx->arch = inputs->gpu_id >> 12;
3469
list_inithead(&ctx->blocks);
3470
3471
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
3472
* (so we don't accidentally duplicate the epilogue since mesa/st has
3473
* messed with our I/O quite a bit already) */
3474
3475
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3476
3477
if (ctx->stage == MESA_SHADER_VERTEX) {
3478
NIR_PASS_V(nir, nir_lower_viewport_transform);
3479
NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
3480
}
3481
3482
/* Lower large arrays to scratch and small arrays to bcsel (TODO: tune
3483
* threshold, but not until addresses / csel is optimized better) */
3484
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
3485
glsl_get_natural_size_align_bytes);
3486
NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3487
3488
NIR_PASS_V(nir, nir_split_var_copies);
3489
NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3490
NIR_PASS_V(nir, nir_lower_var_copies);
3491
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3492
NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3493
glsl_type_size, 0);
3494
3495
if (ctx->stage == MESA_SHADER_FRAGMENT) {
3496
NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
3497
~0, false);
3498
} else {
3499
struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
3500
NIR_PASS_V(nir, nir_shader_instructions_pass,
3501
bifrost_nir_lower_store_component,
3502
nir_metadata_block_index |
3503
nir_metadata_dominance, stores);
3504
}
3505
3506
NIR_PASS_V(nir, nir_lower_ssbo);
3507
NIR_PASS_V(nir, pan_nir_lower_zs_store);
3508
NIR_PASS_V(nir, pan_lower_sample_pos);
3509
NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
3510
3511
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3512
NIR_PASS_V(nir, nir_shader_instructions_pass,
3513
bifrost_nir_lower_i8_frag,
3514
nir_metadata_block_index | nir_metadata_dominance,
3515
NULL);
3516
}
3517
3518
bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);
3519
3520
NIR_PASS_V(nir, pan_nir_reorder_writeout);
3521
3522
bool skip_internal = nir->info.internal;
3523
skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
3524
3525
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
3526
nir_print_shader(nir, stdout);
3527
}
3528
3529
info->tls_size = nir->scratch_size;
3530
3531
nir_foreach_function(func, nir) {
3532
if (!func->impl)
3533
continue;
3534
3535
ctx->ssa_alloc += func->impl->ssa_alloc;
3536
ctx->reg_alloc += func->impl->reg_alloc;
3537
3538
emit_cf_list(ctx, &func->impl->body);
3539
break; /* TODO: Multi-function shaders */
3540
}
3541
3542
unsigned block_source_count = 0;
3543
3544
bi_foreach_block(ctx, _block) {
3545
bi_block *block = (bi_block *) _block;
3546
3547
/* Name blocks now that we're done emitting so the order is
3548
* consistent */
3549
block->base.name = block_source_count++;
3550
}
3551
3552
/* If the shader doesn't write any colour or depth outputs, it may
3553
* still need an ATEST at the very end! */
3554
bool need_dummy_atest =
3555
(ctx->stage == MESA_SHADER_FRAGMENT) &&
3556
!ctx->emitted_atest &&
3557
!bi_skip_atest(ctx, false);
3558
3559
if (need_dummy_atest) {
3560
pan_block *end = list_last_entry(&ctx->blocks, pan_block, link);
3561
bi_builder b = bi_init_builder(ctx, bi_after_block((bi_block *) end));
3562
bi_emit_atest(&b, bi_zero());
3563
}
3564
3565
/* Runs before constant folding */
3566
bi_lower_swizzle(ctx);
3567
3568
/* Runs before copy prop */
3569
bi_opt_push_ubo(ctx);
3570
bi_opt_constant_fold(ctx);
3571
3572
bi_opt_copy_prop(ctx);
3573
bi_opt_mod_prop_forward(ctx);
3574
bi_opt_mod_prop_backward(ctx);
3575
bi_opt_dead_code_eliminate(ctx);
3576
bi_opt_cse(ctx);
3577
bi_opt_dead_code_eliminate(ctx);
3578
3579
bi_foreach_block(ctx, _block) {
3580
bi_block *block = (bi_block *) _block;
3581
bi_lower_branch(block);
3582
}
3583
3584
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3585
bi_print_shader(ctx, stdout);
3586
bi_lower_fau(ctx);
3587
3588
/* Analyze before register allocation to avoid false dependencies. The
3589
* skip bit is a function of only the data flow graph and is invariant
3590
* under valid scheduling. */
3591
bi_analyze_helper_requirements(ctx);
3592
3593
bi_register_allocate(ctx);
3594
bi_opt_post_ra(ctx);
3595
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3596
bi_print_shader(ctx, stdout);
3597
bi_schedule(ctx);
3598
bi_assign_scoreboard(ctx);
3599
3600
/* Analyze after scheduling since we depend on instruction order. */
3601
bi_analyze_helper_terminate(ctx);
3602
3603
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3604
bi_print_shader(ctx, stdout);
3605
3606
unsigned final_clause = bi_pack(ctx, binary);
3607
3608
/* If we need to wait for ATEST or BLEND in the first clause, pass the
3609
* corresponding bits through to the renderer state descriptor */
3610
pan_block *first_block = list_first_entry(&ctx->blocks, pan_block, link);
3611
bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
3612
3613
unsigned first_deps = first_clause ? first_clause->dependencies : 0;
3614
info->bifrost.wait_6 = (first_deps & (1 << 6));
3615
info->bifrost.wait_7 = (first_deps & (1 << 7));
3616
3617
info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
3618
3619
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
3620
disassemble_bifrost(stdout, binary->data, binary->size,
3621
bifrost_debug & BIFROST_DBG_VERBOSE);
3622
fflush(stdout);
3623
}
3624
3625
/* Pad the shader with enough zero bytes to trick the prefetcher,
3626
* unless we're compiling an empty shader (in which case we don't pad
3627
* so the size remains 0) */
3628
unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
3629
3630
if (binary->size) {
3631
memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
3632
0, prefetch_size);
3633
}
3634
3635
if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
3636
!skip_internal) {
3637
bi_print_stats(ctx, binary->size, stderr);
3638
}
3639
3640
ralloc_free(ctx);
3641
}
3642
3643