GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_instruction_selection.cpp
/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/ac_exp_param.h"
#include "common/sid.h"
#include "vulkan/radv_descriptor_set.h"

#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>
#include <vector>

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
          const char* msg)
{
   char* out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE* const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);

static void
add_logical_edge(unsigned pred_idx, Block* succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void
add_linear_edge(unsigned pred_idx, Block* succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void
add_edge(unsigned pred_idx, Block* succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void
append_logical_start(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void
append_logical_end(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp
get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

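/* Per-lane prefix count: v_mbcnt_{lo,hi} count the set bits of `mask` below the
 * current lane and add `base`; with mask = exec this yields each lane's index
 * among the active lanes. */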
Temp
emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo = Operand::c32(-1u);
   Operand mask_hi = Operand::c32(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->chip_class <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

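/* Wraps a value in p_wqm so it is computed in whole-quad mode (helper lanes
 * enabled); this only matters for fragment shaders, other stages just copy. */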
Temp
emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
{
   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (bld.program->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   bld.program->needs_wqm |= program_needs_wqm;
   return dst;
}

static Temp
emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
                        index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
      Temp index_is_lo =
         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
      Builder::Result index_is_lo_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
                                     index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      /* We need one pair of shared VGPRs:
       * Note that these have twice the allocation granularity of normal VGPRs */
      ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
                        index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

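/* The 15-bit swizzle mask packs and/or/xor lane masks (5 bits each, matching the
 * ds_swizzle_b32 encoding): within a group of 32 lanes, lane i reads from lane
 * ((i & and_mask) | or_mask) ^ xor_mask. Common patterns are matched to cheaper
 * DPP controls below. */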
static Temp
emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

Temp
as_vgpr(isel_context* ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}

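/* Division by the constant b is strength-reduced using util_fast_udiv_info:
 * roughly dst = (((a >> pre_shift) + increment) * multiplier) >> 32 >> post_shift,
 * with power-of-two divisors handled as a plain right shift. */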
// assumes a != 0xffffffff
void
emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.copy(Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
               a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
               multiply_dst);
   }
}

void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
}

Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src =
            emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
                   bld.scc(val));
}

Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(bld, tmp, dst);
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 */
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
            bool sign_extend, Temp dst = Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) &&
          "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits < 32);
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      assert(dst_bits >= 16);
      aco_ptr<SDWA_instruction> sdwa{
         create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (sign_extend)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(src_bits < 32);
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = sign_extend ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand::zero(),
               Operand::c32(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp
extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand::c32(swizzle), Operand::c32(src_size),
                 Operand::c32((mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

Temp
get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
                                           sgpr_extract_undef);
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
                                    : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}

Temp
get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly 2 16bit components
    * which are within the same dword
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

uint32_t
get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
{
   nir_ssa_scalar scalar =
      nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
                     Operand::c32((unsigned)ctx->options->address32_hi));
}

void
emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
{
   aco_ptr<SOP2_instruction> sop2{
      create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
         if (src_ub <= 0xffff)
            sop2->operands[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            sop2->operands[i].set24bit(true);
      }
   }

   ctx->block->instructions.emplace_back(std::move(sop2));
}

void
emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
                      bool nuw = false, uint8_t uses_ub = 0)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Operand op0(src0);
   Operand op1(src1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
         if (src_ub <= 0xffff)
            bld.set16bit(i ? op1 : op0);
         else if (src_ub <= 0xffffff)
            bld.set24bit(i ? op1 : op0);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), op0, op1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
   } else {
      if (nuw) {
         bld.nuw().vop2(op, Definition(dst), op0, op1);
      } else {
         bld.vop2(op, Definition(dst), op0, op1);
      }
   }
}

void
emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void
emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

Builder::Result
emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool swap_srcs = false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->dest.dest.ssa.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
   unsigned opsel_lo =
      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi =
      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   emit_split_vector(ctx, dst, 2);
   return res;
}

void
emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

void
emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
         case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
         case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
         case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
         case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
         case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
         case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
         case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
         case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
         case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
         case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
         case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
         case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
         case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
         case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
         case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
         case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
         case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
         default: /* eq and ne are commutative */ break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}

void
emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

void
emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
                aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
                                                             : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
                                                             : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void
emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
                   Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void
emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) &&
                els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op =
            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2
    * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

void
emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
               uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal =
      bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
               bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void
emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void
emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void
emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void
emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

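/* The GFX6 fallback below truncates by masking off the mantissa bits that lie
 * below the unbiased exponent: exponents < 0 produce a signed zero, and
 * exponents > 51 mean the value is already integral and is returned unchanged. */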
Temp
emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent =
      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
              fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
                               Operand::zero());
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp
emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
                             Operand::c32(0x3fefffffu));

   Temp isnan =
      bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   add->vop3().neg[1] = true;

   return add->definitions[0].getTemp();
}

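/* Unsigned saturating 32-bit add: GFX8+ use the VOP3 clamp bit on v_add_(co_)u32,
 * while older chips add with carry-out and select 0xffffffff on overflow. */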
Temp
uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->chip_class < GFX8) {
      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
                          add.def(1).getTemp());
   }

   Builder::Result add(NULL);
   if (bld.program->chip_class >= GFX9) {
      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
   } else {
      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
   }
   add.instr->vop3().clamp = 1;
   return dst.getTemp();
}

void
visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
{
   if (!instr->dest.dest.is_ssa) {
      isel_err(&instr->instr, "nir alu dst not in ssa");
      abort();
   }
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5: {
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->dest.dest.ssa.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
            vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
         bool use_s_pack = ctx->program->chip_class >= GFX9;
         Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));

         std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
         for (unsigned i = 0; i < num; i++) {
            unsigned packed_size = use_s_pack ? 16 : 32;
            unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
            unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
            if (nir_src_is_const(instr->src[i].src)) {
               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
               continue;
            }

            if (offset != packed_size - instr->dest.dest.ssa.bit_size)
               elems[i] =
                  bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);

            if (offset)
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                   Operand::c32(offset));

            if (packed[idx].id())
               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                      packed[idx]);
            else
               packed[idx] = elems[i];
         }

         if (use_s_pack) {
            for (unsigned i = 0; i < dst.size(); i++) {
               bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();

               if (packed[i * 2].id() && packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       packed[i * 2 + 1]);
               else if (packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
                                       Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
               else if (packed[i * 2].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       Operand::c32(const_vals[i * 2 + 1]));

               if (same)
                  const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
               else
                  const_vals[i] = 0;
            }
         }

         for (unsigned i = 0; i < dst.size(); i++) {
            if (const_vals[i] && packed[i].id())
               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                    Operand::c32(const_vals[i]), packed[i]);
            else if (!packed[i].id())
               packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
         }

         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else if (dst.size() == 2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
         else
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
                       packed[2]);
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
         /* use size() instead of bytes() for 8/16-bit */
         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
      } else {
         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->dest.dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         assert(dst.regClass() == bld.lm);
         /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
         Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
         bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
      } else if (dst.regClass() == v1) {
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
                  bld.vsub32(bld.def(v1), Operand::zero(), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp =
            bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
      } else if (dst.regClass() == s2) {
         Temp neg =
            bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
         Temp neqz;
         if (ctx->program->chip_class >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
         else
            neqz =
               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
                  .def(1)
                  .getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
         Temp gtz =
            bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1583
} else {
1584
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1585
}
1586
break;
1587
}
1588
case nir_op_ixor: {
1589
if (instr->dest.dest.ssa.bit_size == 1) {
1590
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1591
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1592
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1593
} else if (dst.regClass() == v2) {
1594
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1595
} else if (dst.regClass() == s1) {
1596
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1597
} else if (dst.regClass() == s2) {
1598
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1599
} else {
1600
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1601
}
1602
break;
1603
}
1604
case nir_op_ushr: {
1605
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1606
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1607
} else if (dst.regClass() == v2b) {
1608
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1609
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1610
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1611
} else if (dst.regClass() == v1) {
1612
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1613
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1614
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1615
get_alu_src(ctx, instr->src[0]));
1616
} else if (dst.regClass() == v2) {
1617
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1618
} else if (dst.regClass() == s2) {
1619
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1620
} else if (dst.regClass() == s1) {
1621
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1622
} else {
1623
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1624
}
1625
break;
1626
}
1627
case nir_op_ishl: {
1628
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1629
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1630
} else if (dst.regClass() == v2b) {
1631
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1632
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1633
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1634
} else if (dst.regClass() == v1) {
1635
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1636
false, 1);
1637
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1638
bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1639
get_alu_src(ctx, instr->src[0]));
1640
} else if (dst.regClass() == v2) {
1641
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1642
} else if (dst.regClass() == s1) {
1643
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1644
} else if (dst.regClass() == s2) {
1645
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1646
} else {
1647
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1648
}
1649
break;
1650
}
1651
case nir_op_ishr: {
1652
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1653
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1654
} else if (dst.regClass() == v2b) {
1655
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1656
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1657
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1658
} else if (dst.regClass() == v1) {
1659
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1660
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1661
bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1662
get_alu_src(ctx, instr->src[0]));
1663
} else if (dst.regClass() == v2) {
1664
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1665
} else if (dst.regClass() == s1) {
1666
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1667
} else if (dst.regClass() == s2) {
1668
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1669
} else {
1670
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1671
}
1672
break;
1673
}
1674
case nir_op_find_lsb: {
1675
Temp src = get_alu_src(ctx, instr->src[0]);
1676
if (src.regClass() == s1) {
1677
bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1678
} else if (src.regClass() == v1) {
1679
emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1680
} else if (src.regClass() == s2) {
1681
bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1682
} else {
1683
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1684
}
1685
break;
1686
}
1687
case nir_op_ufind_msb:
1688
case nir_op_ifind_msb: {
1689
Temp src = get_alu_src(ctx, instr->src[0]);
1690
if (src.regClass() == s1 || src.regClass() == s2) {
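/* The s_flbit opcodes count from the MSB side and return -1 when there is no
 * bit to report. (bits - 1) - msb_rev converts this to the LSB-based index
 * NIR expects; the subtraction only borrows in the -1 case, and the carry
 * then selects the -1 result below. */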
1691
aco_opcode op = src.regClass() == s2
1692
? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1693
: aco_opcode::s_flbit_i32_i64)
1694
: (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1695
: aco_opcode::s_flbit_i32);
1696
Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1697
1698
Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1699
Operand::c32(src.size() * 32u - 1u), msb_rev);
1700
Temp msb = sub.def(0).getTemp();
1701
Temp carry = sub.def(1).getTemp();
1702
1703
bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1704
bld.scc(carry));
1705
} else if (src.regClass() == v1) {
1706
aco_opcode op =
1707
instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1708
Temp msb_rev = bld.tmp(v1);
1709
emit_vop1_instruction(ctx, instr, op, msb_rev);
1710
Temp msb = bld.tmp(v1);
1711
Temp carry =
1712
bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1713
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1714
} else if (src.regClass() == v2) {
1715
aco_opcode op =
1716
instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1717
1718
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1719
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1720
1721
lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1722
bld.vop1(op, bld.def(v1), lo));
1723
hi = bld.vop1(op, bld.def(v1), hi);
1724
Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1725
1726
Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1727
1728
Temp msb = bld.tmp(v1);
1729
Temp carry =
1730
bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1731
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1732
} else {
1733
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1734
}
1735
break;
1736
}
1737
case nir_op_bitfield_reverse: {
1738
if (dst.regClass() == s1) {
1739
bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740
} else if (dst.regClass() == v1) {
1741
bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1742
} else {
1743
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1744
}
1745
break;
1746
}
1747
case nir_op_iadd: {
1748
if (dst.regClass() == s1) {
1749
emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1750
break;
1751
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1752
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1753
break;
1754
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1755
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1756
break;
1757
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1758
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1759
break;
1760
}
1761
1762
Temp src0 = get_alu_src(ctx, instr->src[0]);
1763
Temp src1 = get_alu_src(ctx, instr->src[1]);
1764
if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1765
bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1766
break;
1767
}
1768
1769
assert(src0.size() == 2 && src1.size() == 2);
1770
Temp src00 = bld.tmp(src0.type(), 1);
1771
Temp src01 = bld.tmp(dst.type(), 1);
1772
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1773
Temp src10 = bld.tmp(src1.type(), 1);
1774
Temp src11 = bld.tmp(dst.type(), 1);
1775
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
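/* 64-bit additions are lowered to a 32-bit add plus an add-with-carry on the
 * split halves. */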
1776
1777
if (dst.regClass() == s2) {
1778
Temp carry = bld.tmp(s1);
1779
Temp dst0 =
1780
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1781
Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1782
bld.scc(carry));
1783
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1784
} else if (dst.regClass() == v2) {
1785
Temp dst0 = bld.tmp(v1);
1786
Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1787
Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1788
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1789
} else {
1790
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1791
}
1792
break;
1793
}
1794
case nir_op_uadd_sat: {
1795
Temp src0 = get_alu_src(ctx, instr->src[0]);
1796
Temp src1 = get_alu_src(ctx, instr->src[1]);
1797
if (dst.regClass() == s1) {
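/* SCC holds the carry-out of s_add_u32, so s_cselect clamps the sum to
 * UINT32_MAX on overflow. */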
1798
Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1799
bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1800
bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1801
bld.scc(carry));
1802
} else if (dst.regClass() == v2b) {
1803
Instruction* add_instr;
1804
if (ctx->program->chip_class >= GFX10) {
1805
add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1806
} else {
1807
if (src1.type() == RegType::sgpr)
1808
std::swap(src0, src1);
1809
add_instr =
1810
bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1811
}
1812
add_instr->vop3().clamp = 1;
1813
} else if (dst.regClass() == v1) {
1814
uadd32_sat(bld, Definition(dst), src0, src1);
1815
} else {
1816
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1817
}
1818
break;
1819
}
1820
case nir_op_uadd_carry: {
1821
Temp src0 = get_alu_src(ctx, instr->src[0]);
1822
Temp src1 = get_alu_src(ctx, instr->src[1]);
1823
if (dst.regClass() == s1) {
1824
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1825
break;
1826
}
1827
if (dst.regClass() == v1) {
1828
Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1829
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1830
carry);
1831
break;
1832
}
1833
1834
Temp src00 = bld.tmp(src0.type(), 1);
1835
Temp src01 = bld.tmp(dst.type(), 1);
1836
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1837
Temp src10 = bld.tmp(src1.type(), 1);
1838
Temp src11 = bld.tmp(dst.type(), 1);
1839
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1840
if (dst.regClass() == s2) {
1841
Temp carry = bld.tmp(s1);
1842
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1843
carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1844
bld.scc(carry))
1845
.def(1)
1846
.getTemp();
1847
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1848
} else if (dst.regClass() == v2) {
1849
Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1850
carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1851
carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1852
Operand::c32(1u), carry);
1853
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1854
} else {
1855
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1856
}
1857
break;
1858
}
1859
case nir_op_isub: {
1860
if (dst.regClass() == s1) {
1861
emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1862
break;
1863
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1864
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1865
break;
1866
}
1867
1868
Temp src0 = get_alu_src(ctx, instr->src[0]);
1869
Temp src1 = get_alu_src(ctx, instr->src[1]);
1870
if (dst.regClass() == v1) {
1871
bld.vsub32(Definition(dst), src0, src1);
1872
break;
1873
} else if (dst.bytes() <= 2) {
1874
if (ctx->program->chip_class >= GFX10)
1875
bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1876
else if (src1.type() == RegType::sgpr)
1877
bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1878
else if (ctx->program->chip_class >= GFX8)
1879
bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1880
else
1881
bld.vsub32(Definition(dst), src0, src1);
1882
break;
1883
}
1884
1885
Temp src00 = bld.tmp(src0.type(), 1);
1886
Temp src01 = bld.tmp(dst.type(), 1);
1887
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1888
Temp src10 = bld.tmp(src1.type(), 1);
1889
Temp src11 = bld.tmp(dst.type(), 1);
1890
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1891
if (dst.regClass() == s2) {
1892
Temp borrow = bld.tmp(s1);
1893
Temp dst0 =
1894
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1895
Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1896
bld.scc(borrow));
1897
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1898
} else if (dst.regClass() == v2) {
1899
Temp lower = bld.tmp(v1);
1900
Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1901
Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1902
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1903
} else {
1904
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1905
}
1906
break;
1907
}
1908
case nir_op_usub_borrow: {
1909
Temp src0 = get_alu_src(ctx, instr->src[0]);
1910
Temp src1 = get_alu_src(ctx, instr->src[1]);
1911
if (dst.regClass() == s1) {
1912
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1913
break;
1914
} else if (dst.regClass() == v1) {
1915
Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1916
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1917
borrow);
1918
break;
1919
}
1920
1921
Temp src00 = bld.tmp(src0.type(), 1);
1922
Temp src01 = bld.tmp(dst.type(), 1);
1923
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1924
Temp src10 = bld.tmp(src1.type(), 1);
1925
Temp src11 = bld.tmp(dst.type(), 1);
1926
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1927
if (dst.regClass() == s2) {
1928
Temp borrow = bld.tmp(s1);
1929
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1930
borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1931
bld.scc(borrow))
1932
.def(1)
1933
.getTemp();
1934
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1935
} else if (dst.regClass() == v2) {
1936
Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1937
borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1938
borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1939
Operand::c32(1u), borrow);
1940
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1941
} else {
1942
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1943
}
1944
break;
1945
}
1946
case nir_op_imul: {
1947
if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1948
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1949
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1950
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1951
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1952
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1953
} else if (dst.type() == RegType::vgpr) {
1954
Temp src0 = get_alu_src(ctx, instr->src[0]);
1955
Temp src1 = get_alu_src(ctx, instr->src[1]);
1956
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1957
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1958
1959
if (src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff &&
1960
(ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) {
1961
/* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16
1962
* but only on GFX8-9 because GFX10 doesn't zero the upper 16
1963
* bits.
1964
*/
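/* For example, with src0_ub = src1_ub = 0xff the product is at most
 * 0xff * 0xff = 0xfe01, which still fits in the low 16 bits. */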
1965
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */,
1966
false, false, true /* nuw */);
1967
} else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) {
1968
/* Initialize the accumulator to 0 to allow further combinations
1969
* in the optimizer.
1970
*/
1971
Operand op0(src0);
1972
Operand op1(src1);
1973
bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0),
1974
bld.set16bit(op1), Operand::zero());
1975
} else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1976
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);
1977
} else if (nir_src_is_const(instr->src[0].src)) {
1978
bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1979
nir_src_as_uint(instr->src[0].src), false);
1980
} else if (nir_src_is_const(instr->src[1].src)) {
1981
bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1982
nir_src_as_uint(instr->src[1].src), false);
1983
} else {
1984
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1985
}
1986
} else if (dst.regClass() == s1) {
1987
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1988
} else {
1989
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1990
}
1991
break;
1992
}
1993
case nir_op_umul_high: {
1994
if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1995
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1996
} else if (dst.bytes() == 4) {
1997
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1998
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1999
2000
Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
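/* When s_mul_hi_u32 is not usable (pre-GFX9), an SGPR result is computed in a
 * VGPR and read back with p_as_uniform below. */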
2001
if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2002
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2003
} else {
2004
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2005
}
2006
2007
if (dst.regClass() == s1)
2008
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2009
} else {
2010
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2011
}
2012
break;
2013
}
2014
case nir_op_imul_high: {
2015
if (dst.regClass() == v1) {
2016
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2017
} else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2018
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2019
} else if (dst.regClass() == s1) {
2020
Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2021
as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2022
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2023
} else {
2024
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2025
}
2026
break;
2027
}
2028
case nir_op_fmul: {
2029
if (dst.regClass() == v2b) {
2030
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2031
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2032
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2033
} else if (dst.regClass() == v1) {
2034
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2035
} else if (dst.regClass() == v2) {
2036
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2037
} else {
2038
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2039
}
2040
break;
2041
}
2042
case nir_op_fadd: {
2043
if (dst.regClass() == v2b) {
2044
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2045
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2046
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2047
} else if (dst.regClass() == v1) {
2048
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2049
} else if (dst.regClass() == v2) {
2050
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2051
} else {
2052
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2053
}
2054
break;
2055
}
2056
case nir_op_fsub: {
2057
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2058
Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2059
VOP3P_instruction& sub = add->vop3p();
2060
sub.neg_lo[1] = true;
2061
sub.neg_hi[1] = true;
2062
break;
2063
}
2064
2065
Temp src0 = get_alu_src(ctx, instr->src[0]);
2066
Temp src1 = get_alu_src(ctx, instr->src[1]);
2067
if (dst.regClass() == v2b) {
2068
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2069
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2070
else
2071
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2072
} else if (dst.regClass() == v1) {
2073
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2074
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2075
else
2076
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2077
} else if (dst.regClass() == v2) {
2078
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2079
as_vgpr(ctx, src1));
2080
add->vop3().neg[1] = true;
2081
} else {
2082
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2083
}
2084
break;
2085
}
2086
case nir_op_fmax: {
2087
if (dst.regClass() == v2b) {
2088
// TODO: check fp_mode.must_flush_denorms16_64
2089
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2090
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2091
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2092
} else if (dst.regClass() == v1) {
2093
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2094
ctx->block->fp_mode.must_flush_denorms32);
2095
} else if (dst.regClass() == v2) {
2096
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2097
ctx->block->fp_mode.must_flush_denorms16_64);
2098
} else {
2099
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2100
}
2101
break;
2102
}
2103
case nir_op_fmin: {
2104
if (dst.regClass() == v2b) {
2105
// TODO: check fp_mode.must_flush_denorms16_64
2106
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2107
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2108
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2109
} else if (dst.regClass() == v1) {
2110
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2111
ctx->block->fp_mode.must_flush_denorms32);
2112
} else if (dst.regClass() == v2) {
2113
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2114
ctx->block->fp_mode.must_flush_denorms16_64);
2115
} else {
2116
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2117
}
2118
break;
2119
}
2120
case nir_op_cube_face_coord_amd: {
2121
Temp in = get_alu_src(ctx, instr->src[0], 3);
2122
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2123
emit_extract_vector(ctx, in, 2, v1)};
2124
Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2125
ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
2126
Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2127
Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2128
sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2129
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2130
tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2131
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2132
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2133
break;
2134
}
2135
case nir_op_cube_face_index_amd: {
2136
Temp in = get_alu_src(ctx, instr->src[0], 3);
2137
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2138
emit_extract_vector(ctx, in, 2, v1)};
2139
bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2140
break;
2141
}
2142
case nir_op_bcsel: {
2143
emit_bcsel(ctx, instr, dst);
2144
break;
2145
}
2146
case nir_op_frsq: {
2147
if (dst.regClass() == v2b) {
2148
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2149
} else if (dst.regClass() == v1) {
2150
Temp src = get_alu_src(ctx, instr->src[0]);
2151
emit_rsq(ctx, bld, Definition(dst), src);
2152
} else if (dst.regClass() == v2) {
2153
/* Lowered at NIR level for precision reasons. */
2154
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2155
} else {
2156
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2157
}
2158
break;
2159
}
2160
case nir_op_fneg: {
2161
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2162
Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2163
bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2164
instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2165
emit_split_vector(ctx, dst, 2);
2166
break;
2167
}
2168
Temp src = get_alu_src(ctx, instr->src[0]);
2169
if (dst.regClass() == v2b) {
2170
bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2171
} else if (dst.regClass() == v1) {
2172
bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2173
as_vgpr(ctx, src));
2174
} else if (dst.regClass() == v2) {
2175
if (ctx->block->fp_mode.must_flush_denorms16_64)
2176
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2177
as_vgpr(ctx, src));
2178
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2179
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2180
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2181
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2182
} else {
2183
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2184
}
2185
break;
2186
}
2187
case nir_op_fabs: {
2188
Temp src = get_alu_src(ctx, instr->src[0]);
2189
if (dst.regClass() == v2b) {
2190
Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2191
Operand::c16(0x3c00), as_vgpr(ctx, src))
2192
.instr;
2193
mul->vop3().abs[1] = true;
2194
} else if (dst.regClass() == v1) {
2195
Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2196
Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2197
.instr;
2198
mul->vop3().abs[1] = true;
2199
} else if (dst.regClass() == v2) {
2200
if (ctx->block->fp_mode.must_flush_denorms16_64)
2201
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2202
as_vgpr(ctx, src));
2203
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2204
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2205
upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2206
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2207
} else {
2208
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2209
}
2210
break;
2211
}
2212
case nir_op_fsat: {
2213
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2214
Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2215
Instruction* vop3p =
2216
bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2217
instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2218
vop3p->vop3p().clamp = true;
2219
emit_split_vector(ctx, dst, 2);
2220
break;
2221
}
2222
Temp src = get_alu_src(ctx, instr->src[0]);
2223
if (dst.regClass() == v2b) {
2224
bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2225
src);
2226
} else if (dst.regClass() == v1) {
2227
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2228
Operand::c32(0x3f800000u), src);
2229
/* apparently, it is not necessary to flush denorms if this instruction is used with these
2230
* operands */
2231
// TODO: confirm that this holds under any circumstances
2232
} else if (dst.regClass() == v2) {
2233
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2234
add->vop3().clamp = true;
2235
} else {
2236
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2237
}
2238
break;
2239
}
2240
case nir_op_flog2: {
2241
if (dst.regClass() == v2b) {
2242
emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2243
} else if (dst.regClass() == v1) {
2244
Temp src = get_alu_src(ctx, instr->src[0]);
2245
emit_log2(ctx, bld, Definition(dst), src);
2246
} else {
2247
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2248
}
2249
break;
2250
}
2251
case nir_op_frcp: {
2252
if (dst.regClass() == v2b) {
2253
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2254
} else if (dst.regClass() == v1) {
2255
Temp src = get_alu_src(ctx, instr->src[0]);
2256
emit_rcp(ctx, bld, Definition(dst), src);
2257
} else if (dst.regClass() == v2) {
2258
/* Lowered at NIR level for precision reasons. */
2259
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2260
} else {
2261
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2262
}
2263
break;
2264
}
2265
case nir_op_fexp2: {
2266
if (dst.regClass() == v2b) {
2267
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2268
} else if (dst.regClass() == v1) {
2269
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2270
} else {
2271
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2272
}
2273
break;
2274
}
2275
case nir_op_fsqrt: {
2276
if (dst.regClass() == v2b) {
2277
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2278
} else if (dst.regClass() == v1) {
2279
Temp src = get_alu_src(ctx, instr->src[0]);
2280
emit_sqrt(ctx, bld, Definition(dst), src);
2281
} else if (dst.regClass() == v2) {
2282
/* Lowered at NIR level for precision reasons. */
2283
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2284
} else {
2285
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2286
}
2287
break;
2288
}
2289
case nir_op_ffract: {
2290
if (dst.regClass() == v2b) {
2291
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2292
} else if (dst.regClass() == v1) {
2293
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2294
} else if (dst.regClass() == v2) {
2295
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2296
} else {
2297
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2298
}
2299
break;
2300
}
2301
case nir_op_ffloor: {
2302
if (dst.regClass() == v2b) {
2303
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2304
} else if (dst.regClass() == v1) {
2305
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2306
} else if (dst.regClass() == v2) {
2307
Temp src = get_alu_src(ctx, instr->src[0]);
2308
emit_floor_f64(ctx, bld, Definition(dst), src);
2309
} else {
2310
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2311
}
2312
break;
2313
}
2314
case nir_op_fceil: {
2315
if (dst.regClass() == v2b) {
2316
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2317
} else if (dst.regClass() == v1) {
2318
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2319
} else if (dst.regClass() == v2) {
2320
if (ctx->options->chip_class >= GFX7) {
2321
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2322
} else {
2323
/* GFX6 doesn't support V_CEIL_F64, lower it. */
2324
/* trunc = trunc(src0)
2325
* if (src0 > 0.0 && src0 != trunc)
2326
* trunc += 1.0
2327
*/
2328
Temp src0 = get_alu_src(ctx, instr->src[0]);
2329
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2330
Temp tmp0 =
2331
bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2332
Temp tmp1 =
2333
bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2334
Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2335
tmp0, tmp1);
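/* Only the high dword differs between 0.0 and 1.0, so a single v_cndmask
 * selecting 0x3ff00000 is enough; the low dword of the addend is zero either
 * way. */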
2336
Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2337
bld.copy(bld.def(v1), Operand::zero()),
2338
bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2339
add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2340
bld.copy(bld.def(v1), Operand::zero()), add);
2341
bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2342
}
2343
} else {
2344
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2345
}
2346
break;
2347
}
2348
case nir_op_ftrunc: {
2349
if (dst.regClass() == v2b) {
2350
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2351
} else if (dst.regClass() == v1) {
2352
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2353
} else if (dst.regClass() == v2) {
2354
Temp src = get_alu_src(ctx, instr->src[0]);
2355
emit_trunc_f64(ctx, bld, Definition(dst), src);
2356
} else {
2357
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2358
}
2359
break;
2360
}
2361
case nir_op_fround_even: {
2362
if (dst.regClass() == v2b) {
2363
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2364
} else if (dst.regClass() == v1) {
2365
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2366
} else if (dst.regClass() == v2) {
2367
if (ctx->options->chip_class >= GFX7) {
2368
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2369
} else {
2370
/* GFX6 doesn't support V_RNDNE_F64, lower it. */
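/* Round to nearest-even by adding and then subtracting 2**52 with the sign of
 * the source (v_bfi merges the source's sign bit into the 0x43300000 high
 * dword). Sources with a magnitude of 2**52 or more are already integers and
 * are passed through unchanged by the final v_cndmask. */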
2371
Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2372
Temp src0 = get_alu_src(ctx, instr->src[0]);
2373
bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2374
2375
Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2376
bld.copy(bld.def(s1), Operand::c32(-2u)));
2377
Temp bfi =
2378
bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2379
bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2380
Temp tmp =
2381
bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2382
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2383
Instruction* sub =
2384
bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2385
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2386
sub->vop3().neg[1] = true;
2387
tmp = sub->definitions[0].getTemp();
2388
2389
Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2390
Operand::c32(0x432fffffu));
2391
Instruction* vop3 =
2392
bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2393
vop3->vop3().abs[0] = true;
2394
Temp cond = vop3->definitions[0].getTemp();
2395
2396
Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2397
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2398
Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2399
as_vgpr(ctx, src0_lo), cond);
2400
Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2401
as_vgpr(ctx, src0_hi), cond);
2402
2403
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2404
}
2405
} else {
2406
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2407
}
2408
break;
2409
}
2410
case nir_op_fsin:
2411
case nir_op_fcos: {
2412
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2413
aco_ptr<Instruction> norm;
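/* The hardware sin/cos expect the angle pre-scaled by 1/(2*PI): 0x3118 is
 * ~0.1592 in float16 and 0x3e22f983 in float32. */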
2414
if (dst.regClass() == v2b) {
2415
Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
2416
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2417
aco_opcode opcode =
2418
instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2419
bld.vop1(opcode, Definition(dst), tmp);
2420
} else if (dst.regClass() == v1) {
2421
Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
2422
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2423
2424
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2425
if (ctx->options->chip_class < GFX9)
2426
tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2427
2428
aco_opcode opcode =
2429
instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2430
bld.vop1(opcode, Definition(dst), tmp);
2431
} else {
2432
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2433
}
2434
break;
2435
}
2436
case nir_op_ldexp: {
2437
if (dst.regClass() == v2b) {
2438
emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2439
} else if (dst.regClass() == v1) {
2440
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2441
} else if (dst.regClass() == v2) {
2442
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2443
} else {
2444
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2445
}
2446
break;
2447
}
2448
case nir_op_frexp_sig: {
2449
if (dst.regClass() == v2b) {
2450
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2451
} else if (dst.regClass() == v1) {
2452
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2453
} else if (dst.regClass() == v2) {
2454
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2455
} else {
2456
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2457
}
2458
break;
2459
}
2460
case nir_op_frexp_exp: {
2461
if (instr->src[0].src.ssa->bit_size == 16) {
2462
Temp src = get_alu_src(ctx, instr->src[0]);
2463
Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2464
tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2465
convert_int(ctx, bld, tmp, 8, 32, true, dst);
2466
} else if (instr->src[0].src.ssa->bit_size == 32) {
2467
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2468
} else if (instr->src[0].src.ssa->bit_size == 64) {
2469
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2470
} else {
2471
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2472
}
2473
break;
2474
}
2475
case nir_op_fsign: {
2476
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2477
if (dst.regClass() == v2b) {
2478
assert(ctx->program->chip_class >= GFX9);
2479
/* replace negative zero with positive zero */
2480
src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2481
src =
2482
bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2483
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2484
} else if (dst.regClass() == v1) {
2485
src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2486
src =
2487
bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2488
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2489
} else if (dst.regClass() == v2) {
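/* The result's low dword is always zero, so only the high dword is computed:
 * the source's high dword is kept for (signed) zero, otherwise the high dword
 * of +1.0 (0x3FF00000) or -1.0 (0xBFF00000) is selected. */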
2490
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
2491
Operand::zero(), src);
2492
Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2493
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2494
emit_extract_vector(ctx, src, 1, v1), cond);
2495
2496
cond =
2497
bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
2498
tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2499
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2500
2501
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2502
} else {
2503
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2504
}
2505
break;
2506
}
2507
case nir_op_f2f16:
2508
case nir_op_f2f16_rtne: {
2509
Temp src = get_alu_src(ctx, instr->src[0]);
2510
if (instr->src[0].src.ssa->bit_size == 64)
2511
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2512
if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2513
/* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2514
* keep value numbering and the scheduler simpler.
2515
*/
2516
bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2517
else
2518
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2519
break;
2520
}
2521
case nir_op_f2f16_rtz: {
2522
Temp src = get_alu_src(ctx, instr->src[0]);
2523
if (instr->src[0].src.ssa->bit_size == 64)
2524
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
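/* v_cvt_pkrtz_f16_f32 always rounds towards zero regardless of the current
 * rounding mode; if the mode is already round-towards-zero, the ordinary
 * conversion can be used instead. */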
2525
if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2526
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2527
else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
2528
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2529
else
2530
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2531
break;
2532
}
2533
case nir_op_f2f32: {
2534
if (instr->src[0].src.ssa->bit_size == 16) {
2535
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2536
} else if (instr->src[0].src.ssa->bit_size == 64) {
2537
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2538
} else {
2539
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2540
}
2541
break;
2542
}
2543
case nir_op_f2f64: {
2544
Temp src = get_alu_src(ctx, instr->src[0]);
2545
if (instr->src[0].src.ssa->bit_size == 16)
2546
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2547
bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2548
break;
2549
}
2550
case nir_op_i2f16: {
2551
assert(dst.regClass() == v2b);
2552
Temp src = get_alu_src(ctx, instr->src[0]);
2553
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2554
if (input_size <= 16) {
2555
/* Expand the integer to the size expected by the int→float converter used below */
2556
unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2557
if (input_size != target_size) {
2558
src = convert_int(ctx, bld, src, input_size, target_size, true);
2559
}
2560
} else if (input_size == 64) {
2561
/* Truncate down to 32 bits; if any of the upper bits are relevant,
2562
* the value does not fall into the half-precision float range
2563
* anyway. SPIR-V does not mandate any specific behavior for such
2564
* large inputs.
2565
*/
2566
src = convert_int(ctx, bld, src, 64, 32, false);
2567
}
2568
2569
if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
2570
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2571
} else {
2572
/* Convert to f32 and then down to f16. This is needed to handle
2573
* inputs slightly outside the range [INT16_MIN, INT16_MAX],
2574
* which are representable via f16 but wouldn't be converted
2575
* correctly by v_cvt_f16_i16.
2576
*
2577
* This is also the fallback-path taken on GFX7 and earlier, which
2578
* do not support direct f16⟷i16 conversions.
2579
*/
2580
src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2581
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2582
}
2583
break;
2584
}
2585
case nir_op_i2f32: {
2586
assert(dst.size() == 1);
2587
Temp src = get_alu_src(ctx, instr->src[0]);
2588
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2589
if (input_size <= 32) {
2590
if (input_size <= 16) {
2591
/* Sign-extend to 32-bits */
2592
src = convert_int(ctx, bld, src, input_size, 32, true);
2593
}
2594
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2595
} else {
2596
assert(input_size == 64);
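/* Convert via f64: the low half as unsigned, the high half as signed and
 * scaled by 2**32, then sum and convert the result to f32. */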
2597
RegClass rc = RegClass(src.type(), 1);
2598
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2599
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2600
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2601
upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2602
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2603
upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2604
bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2605
}
2606
2607
break;
2608
}
2609
case nir_op_i2f64: {
2610
if (instr->src[0].src.ssa->bit_size <= 32) {
2611
Temp src = get_alu_src(ctx, instr->src[0]);
2612
if (instr->src[0].src.ssa->bit_size <= 16)
2613
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2614
bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2615
} else if (instr->src[0].src.ssa->bit_size == 64) {
2616
Temp src = get_alu_src(ctx, instr->src[0]);
2617
RegClass rc = RegClass(src.type(), 1);
2618
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2619
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2620
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2621
upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2622
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2623
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2624
2625
} else {
2626
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2627
}
2628
break;
2629
}
2630
case nir_op_u2f16: {
2631
assert(dst.regClass() == v2b);
2632
Temp src = get_alu_src(ctx, instr->src[0]);
2633
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2634
if (input_size <= 16) {
2635
/* Expand integer to the size expected by the uint→float converter used below */
2636
unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2637
if (input_size != target_size) {
2638
src = convert_int(ctx, bld, src, input_size, target_size, false);
2639
}
2640
} else if (input_size == 64) {
2641
/* Truncate down to 32 bits; if any of the upper bits are non-zero,
2642
* the value does not fall into the half-precision float range
2643
* anyway. SPIR-V does not mandate any specific behavior for such
2644
* large inputs.
2645
*/
2646
src = convert_int(ctx, bld, src, 64, 32, false);
2647
}
2648
2649
if (ctx->program->chip_class >= GFX8) {
2650
/* Unsigned integers up to 65519 convert to finite float16 values. Converting
2651
* from larger inputs is UB, so we just need to consider the lower 16 bits */
2652
bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2653
} else {
2654
/* GFX7 and earlier do not support direct f16⟷u16 conversions */
2655
src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2656
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2657
}
2658
break;
2659
}
2660
case nir_op_u2f32: {
2661
assert(dst.size() == 1);
2662
Temp src = get_alu_src(ctx, instr->src[0]);
2663
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2664
if (input_size == 8) {
2665
bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2666
} else if (input_size <= 32) {
2667
if (input_size == 16)
2668
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2669
bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2670
} else {
2671
assert(input_size == 64);
2672
RegClass rc = RegClass(src.type(), 1);
2673
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2674
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2675
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2676
upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2677
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2678
upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2679
bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2680
}
2681
break;
2682
}
2683
case nir_op_u2f64: {
2684
if (instr->src[0].src.ssa->bit_size <= 32) {
2685
Temp src = get_alu_src(ctx, instr->src[0]);
2686
if (instr->src[0].src.ssa->bit_size <= 16)
2687
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2688
bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2689
} else if (instr->src[0].src.ssa->bit_size == 64) {
2690
Temp src = get_alu_src(ctx, instr->src[0]);
2691
RegClass rc = RegClass(src.type(), 1);
2692
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2693
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2694
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2695
upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2696
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2697
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2698
} else {
2699
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2700
}
2701
break;
2702
}
2703
case nir_op_f2i8:
2704
case nir_op_f2i16: {
2705
if (instr->src[0].src.ssa->bit_size == 16) {
2706
if (ctx->program->chip_class >= GFX8) {
2707
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2708
} else {
2709
/* GFX7 and earlier do not support direct f16⟷i16 conversions */
2710
Temp tmp = bld.tmp(v1);
2711
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2712
tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
2713
tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2714
(dst.type() == RegType::sgpr) ? Temp() : dst);
2715
if (dst.type() == RegType::sgpr) {
2716
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2717
}
2718
}
2719
} else if (instr->src[0].src.ssa->bit_size == 32) {
2720
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2721
} else {
2722
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2723
}
2724
break;
2725
}
2726
case nir_op_f2u8:
2727
case nir_op_f2u16: {
2728
if (instr->src[0].src.ssa->bit_size == 16) {
2729
if (ctx->program->chip_class >= GFX8) {
2730
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2731
} else {
2732
/* GFX7 and earlier do not support direct f16⟷u16 conversions */
2733
Temp tmp = bld.tmp(v1);
2734
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2735
tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
2736
tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2737
(dst.type() == RegType::sgpr) ? Temp() : dst);
2738
if (dst.type() == RegType::sgpr) {
2739
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2740
}
2741
}
2742
} else if (instr->src[0].src.ssa->bit_size == 32) {
2743
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2744
} else {
2745
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2746
}
2747
break;
2748
}
2749
case nir_op_f2i32: {
2750
Temp src = get_alu_src(ctx, instr->src[0]);
2751
if (instr->src[0].src.ssa->bit_size == 16) {
2752
Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2753
if (dst.type() == RegType::vgpr) {
2754
bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2755
} else {
2756
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2757
bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2758
}
2759
} else if (instr->src[0].src.ssa->bit_size == 32) {
2760
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2761
} else if (instr->src[0].src.ssa->bit_size == 64) {
2762
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2763
} else {
2764
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2765
}
2766
break;
2767
}
2768
case nir_op_f2u32: {
2769
Temp src = get_alu_src(ctx, instr->src[0]);
2770
if (instr->src[0].src.ssa->bit_size == 16) {
2771
Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2772
if (dst.type() == RegType::vgpr) {
2773
bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2774
} else {
2775
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2776
bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2777
}
2778
} else if (instr->src[0].src.ssa->bit_size == 32) {
2779
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2780
} else if (instr->src[0].src.ssa->bit_size == 64) {
2781
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2782
} else {
2783
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2784
}
2785
break;
2786
}
2787
case nir_op_f2i64: {
2788
Temp src = get_alu_src(ctx, instr->src[0]);
2789
if (instr->src[0].src.ssa->bit_size == 16)
2790
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2791
2792
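/* There is no direct f32->i64 conversion, so this path rebuilds the integer manually:
 * clamp the frexp exponent to [0, 64], reassemble the 24-bit mantissa with its implicit
 * leading one, shift it into a 64-bit value, then apply the sign and saturate when the
 * borrow of the 63-exponent subtraction signals an out-of-range exponent. */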
if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2793
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2794
exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
2795
Operand::c32(64u));
2796
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2797
Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
2798
mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2799
mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
2800
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2801
Temp new_exponent = bld.tmp(v1);
2802
Temp borrow =
2803
bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
2804
if (ctx->program->chip_class >= GFX8)
2805
mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2806
else
2807
mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2808
Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
2809
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2810
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2811
lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
2812
Operand::c32(0xffffffffu), borrow);
2813
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2814
lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2815
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2816
Temp new_lower = bld.tmp(v1);
2817
borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2818
Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2819
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2820
2821
} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2822
if (src.type() == RegType::vgpr)
2823
src = bld.as_uniform(src);
2824
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2825
Operand::c32(0x80017u));
2826
exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2827
Operand::c32(126u));
2828
exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2829
exponent);
2830
exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
2831
Operand::c32(64u), exponent);
2832
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2833
Operand::c32(0x7fffffu), src);
2834
Temp sign =
2835
bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
2836
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2837
Operand::c32(0x800000u), mantissa);
2838
mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2839
Operand::c32(7u));
2840
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2841
exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2842
Operand::c32(63u), exponent);
2843
mantissa =
2844
bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2845
Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
2846
Operand::c32(0xffffffffu)); // exp >= 64
2847
Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
2848
mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2849
Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2850
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2851
lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2852
upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2853
Temp borrow = bld.tmp(s1);
2854
lower =
2855
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2856
upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
2857
bld.scc(borrow));
2858
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2859
2860
} else if (instr->src[0].src.ssa->bit_size == 64) {
2861
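/* f64 source: 0x3df00000 is the high dword of the double constant 2^-32 and 0xc1f00000 that
 * of -2^32, so hi = floor(trunc(x) * 2^-32) and lo = fma(hi, -2^32, trunc(x)) yield the upper
 * and lower 32 bits, which v_cvt_{i,u}32_f64 can then convert directly. */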
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2862
Operand::c32(0x3df00000u));
2863
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2864
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2865
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2866
Operand::c32(0xc1f00000u));
2867
Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2868
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2869
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2870
Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2871
if (dst.type() == RegType::sgpr) {
2872
lower = bld.as_uniform(lower);
2873
upper = bld.as_uniform(upper);
2874
}
2875
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2876
2877
} else {
2878
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2879
}
2880
break;
2881
}
2882
case nir_op_f2u64: {
2883
Temp src = get_alu_src(ctx, instr->src[0]);
2884
if (instr->src[0].src.ssa->bit_size == 16)
2885
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2886
2887
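/* Roughly the unsigned analogue of the f2i64 path above: exponents above 64 saturate the
 * result to ~0ull, and for small exponents (< 24) the mantissa is shifted right instead of
 * left so the value still lands in the low 32 bits. */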
if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2888
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2889
Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
2890
Operand::c32(64u), exponent);
2891
exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
2892
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2893
mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2894
Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
2895
Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2896
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2897
Temp new_exponent = bld.tmp(v1);
2898
Temp cond_small =
2899
bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
2900
if (ctx->program->chip_class >= GFX8)
2901
mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2902
else
2903
mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2904
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2905
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2906
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2907
upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
2908
cond_small);
2909
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
2910
exponent_in_range);
2911
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
2912
exponent_in_range);
2913
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2914
2915
} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2916
if (src.type() == RegType::vgpr)
2917
src = bld.as_uniform(src);
2918
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2919
Operand::c32(0x80017u));
2920
exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2921
Operand::c32(126u));
2922
exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2923
exponent);
2924
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2925
Operand::c32(0x7fffffu), src);
2926
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2927
Operand::c32(0x800000u), mantissa);
2928
Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2929
Operand::c32(24u), exponent);
2930
Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2931
exponent_small);
2932
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2933
Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2934
exponent, Operand::c32(24u));
2935
mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
2936
exponent_large);
2937
Temp cond =
2938
bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
2939
mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
2940
Operand::c32(0xffffffffu), cond);
2941
Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2942
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2943
Temp cond_small =
2944
bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
2945
lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2946
upper =
2947
bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
2948
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2949
2950
} else if (instr->src[0].src.ssa->bit_size == 64) {
2951
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2952
Operand::c32(0x3df00000u));
2953
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2954
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2955
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2956
Operand::c32(0xc1f00000u));
2957
Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2958
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2959
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2960
Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2961
if (dst.type() == RegType::sgpr) {
2962
lower = bld.as_uniform(lower);
2963
upper = bld.as_uniform(upper);
2964
}
2965
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2966
2967
} else {
2968
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2969
}
2970
break;
2971
}
2972
case nir_op_b2f16: {
2973
Temp src = get_alu_src(ctx, instr->src[0]);
2974
assert(src.regClass() == bld.lm);
2975
2976
if (dst.regClass() == s1) {
2977
src = bool_to_scalar_condition(ctx, src);
2978
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
2979
} else if (dst.regClass() == v2b) {
2980
Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
2981
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
2982
} else {
2983
unreachable("Wrong destination register class for nir_op_b2f16.");
2984
}
2985
break;
2986
}
2987
case nir_op_b2f32: {
2988
Temp src = get_alu_src(ctx, instr->src[0]);
2989
assert(src.regClass() == bld.lm);
2990
2991
if (dst.regClass() == s1) {
2992
src = bool_to_scalar_condition(ctx, src);
2993
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
2994
} else if (dst.regClass() == v1) {
2995
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
2996
Operand::c32(0x3f800000u), src);
2997
} else {
2998
unreachable("Wrong destination register class for nir_op_b2f32.");
2999
}
3000
break;
3001
}
3002
case nir_op_b2f64: {
3003
Temp src = get_alu_src(ctx, instr->src[0]);
3004
assert(src.regClass() == bld.lm);
3005
3006
if (dst.regClass() == s2) {
3007
src = bool_to_scalar_condition(ctx, src);
3008
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3009
Operand::zero(), bld.scc(src));
3010
} else if (dst.regClass() == v2) {
3011
Temp one = bld.copy(bld.def(v2), Operand::c32(0x3FF00000u));
3012
Temp upper =
3013
bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3014
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3015
} else {
3016
unreachable("Wrong destination register class for nir_op_b2f64.");
3017
}
3018
break;
3019
}
3020
case nir_op_i2i8:
3021
case nir_op_i2i16:
3022
case nir_op_i2i32:
3023
case nir_op_i2i64: {
3024
if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3025
/* no need to do the extract in get_alu_src() */
3026
sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3027
? sgpr_extract_sext
3028
: sgpr_extract_undef;
3029
extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3030
} else {
3031
const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3032
const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
3033
convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3034
output_bitsize > input_bitsize, dst);
3035
}
3036
break;
3037
}
3038
case nir_op_u2u8:
3039
case nir_op_u2u16:
3040
case nir_op_u2u32:
3041
case nir_op_u2u64: {
3042
if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3043
/* no need to do the extract in get_alu_src() */
3044
sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3045
? sgpr_extract_zext
3046
: sgpr_extract_undef;
3047
extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3048
} else {
3049
convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3050
instr->dest.dest.ssa.bit_size, false, dst);
3051
}
3052
break;
3053
}
3054
case nir_op_b2b32:
3055
case nir_op_b2i8:
3056
case nir_op_b2i16:
3057
case nir_op_b2i32:
3058
case nir_op_b2i64: {
3059
Temp src = get_alu_src(ctx, instr->src[0]);
3060
assert(src.regClass() == bld.lm);
3061
3062
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
3063
if (tmp.regClass() == s1) {
3064
bool_to_scalar_condition(ctx, src, tmp);
3065
} else if (tmp.type() == RegType::vgpr) {
3066
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
3067
src);
3068
} else {
3069
unreachable("Invalid register class for b2i32");
3070
}
3071
3072
if (tmp != dst)
3073
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
3074
break;
3075
}
3076
case nir_op_b2b1:
3077
case nir_op_i2b1: {
3078
Temp src = get_alu_src(ctx, instr->src[0]);
3079
assert(dst.regClass() == bld.lm);
3080
3081
if (src.type() == RegType::vgpr) {
3082
assert(src.regClass() == v1 || src.regClass() == v2);
3083
assert(dst.regClass() == bld.lm);
3084
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3085
Definition(dst), Operand::zero(), src)
3086
.def(0)
3087
.setHint(vcc);
3088
} else {
3089
assert(src.regClass() == s1 || src.regClass() == s2);
3090
Temp tmp;
3091
if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
3092
tmp =
3093
bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3094
.def(1)
3095
.getTemp();
3096
} else {
3097
tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3098
bld.scc(bld.def(s1)), Operand::zero(), src);
3099
}
3100
bool_to_vector_condition(ctx, tmp, dst);
3101
}
3102
break;
3103
}
3104
case nir_op_unpack_64_2x32:
3105
case nir_op_unpack_32_2x16:
3106
case nir_op_unpack_64_4x16:
3107
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3108
emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3109
break;
3110
case nir_op_pack_64_2x32_split: {
3111
Temp src0 = get_alu_src(ctx, instr->src[0]);
3112
Temp src1 = get_alu_src(ctx, instr->src[1]);
3113
3114
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3115
break;
3116
}
3117
case nir_op_unpack_64_2x32_split_x:
3118
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3119
get_alu_src(ctx, instr->src[0]));
3120
break;
3121
case nir_op_unpack_64_2x32_split_y:
3122
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3123
get_alu_src(ctx, instr->src[0]));
3124
break;
3125
case nir_op_unpack_32_2x16_split_x:
3126
if (dst.type() == RegType::vgpr) {
3127
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3128
get_alu_src(ctx, instr->src[0]));
3129
} else {
3130
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3131
}
3132
break;
3133
case nir_op_unpack_32_2x16_split_y:
3134
if (dst.type() == RegType::vgpr) {
3135
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3136
get_alu_src(ctx, instr->src[0]));
3137
} else {
3138
bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3139
get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3140
Operand::zero());
3141
}
3142
break;
3143
case nir_op_pack_32_2x16_split: {
3144
Temp src0 = get_alu_src(ctx, instr->src[0]);
3145
Temp src1 = get_alu_src(ctx, instr->src[1]);
3146
if (dst.regClass() == v1) {
3147
src0 = emit_extract_vector(ctx, src0, 0, v2b);
3148
src1 = emit_extract_vector(ctx, src1, 0, v2b);
3149
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3150
} else {
3151
src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3152
Operand::c32(0xFFFFu));
3153
src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3154
Operand::c32(16u));
3155
bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3156
}
3157
break;
3158
}
3159
case nir_op_pack_half_2x16_split: {
3160
if (dst.regClass() == v1) {
3161
nir_const_value* val = nir_src_as_const_value(instr->src[1].src);
3162
if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) {
3163
/* v_cvt_f16_f32 zeroes the upper 16 bits of the dword on GFX6-GFX9 */
3164
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));
3165
} else if (!ctx->block->fp_mode.care_about_round16_64 ||
3166
ctx->block->fp_mode.round16_64 == fp_round_tz) {
3167
if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
3168
emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3169
else
3170
emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3171
} else {
3172
Temp src0 =
3173
bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
3174
Temp src1 =
3175
bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
3176
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3177
}
3178
} else {
3179
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3180
}
3181
break;
3182
}
3183
case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3184
case nir_op_unpack_half_2x16_split_x: {
3185
Temp src = get_alu_src(ctx, instr->src[0]);
3186
if (src.regClass() == v1)
3187
src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3188
if (dst.regClass() == v1) {
3189
assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3190
(instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3191
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3192
} else {
3193
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194
}
3195
break;
3196
}
3197
case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3198
case nir_op_unpack_half_2x16_split_y: {
3199
Temp src = get_alu_src(ctx, instr->src[0]);
3200
if (src.regClass() == s1)
3201
src =
3202
bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
3203
else
3204
src =
3205
bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3206
if (dst.regClass() == v1) {
3207
assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3208
(instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3209
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3210
} else {
3211
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3212
}
3213
break;
3214
}
3215
case nir_op_sad_u8x4: {
3216
assert(dst.regClass() == v1);
3217
emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3218
break;
3219
}
3220
case nir_op_fquantize2f16: {
3221
Temp src = get_alu_src(ctx, instr->src[0]);
3222
Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
3223
Temp f32, cmp_res;
3224
3225
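/* Quantize by round-tripping through f16: keep the converted value unless the f16 result is
 * a denormal (class mask 0x36F excludes +/-denormal) or, pre-GFX8, unless |x| is non-zero but
 * below the smallest normal half; in those cases the result is flushed to zero
 * (sign-preserving when required). */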
if (ctx->program->chip_class >= GFX8) {
3226
Temp mask = bld.copy(
3227
bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3228
cmp_res =
3229
bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
3230
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3231
} else {
3232
/* 0x38800000 is the smallest normal half-float value (2^-14) expressed as a 32-bit
 * float, so compare |result| against it and flush to 0 if it's smaller (and non-zero).
 */
3235
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3236
Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3237
Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3238
tmp0->vop3().abs[0] = true;
3239
Temp tmp1 =
3240
bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
3241
cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3242
tmp0->definitions[0].getTemp(), tmp1);
3243
}
3244
3245
if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3246
Temp copysign_0 =
3247
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3248
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3249
} else {
3250
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3251
}
3252
break;
3253
}
3254
case nir_op_bfm: {
3255
Temp bits = get_alu_src(ctx, instr->src[0]);
3256
Temp offset = get_alu_src(ctx, instr->src[1]);
3257
3258
if (dst.regClass() == s1) {
3259
bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3260
} else if (dst.regClass() == v1) {
3261
bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3262
} else {
3263
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3264
}
3265
break;
3266
}
3267
case nir_op_bitfield_select: {
3268
3269
/* dst = (insert & bitmask) | (base & ~bitmask) */
3270
if (dst.regClass() == s1) {
3271
Temp bitmask = get_alu_src(ctx, instr->src[0]);
3272
Temp insert = get_alu_src(ctx, instr->src[1]);
3273
Temp base = get_alu_src(ctx, instr->src[2]);
3274
aco_ptr<Instruction> sop2;
3275
nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3276
nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3277
Operand lhs;
3278
if (const_insert && const_bitmask) {
3279
lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3280
} else {
3281
insert =
3282
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3283
lhs = Operand(insert);
3284
}
3285
3286
Operand rhs;
3287
nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3288
if (const_base && const_bitmask) {
3289
rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3290
} else {
3291
base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3292
rhs = Operand(base);
3293
}
3294
3295
bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3296
3297
} else if (dst.regClass() == v1) {
3298
emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3299
} else {
3300
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3301
}
3302
break;
3303
}
3304
case nir_op_ubfe:
3305
case nir_op_ibfe: {
3306
if (dst.bytes() != 4)
3307
unreachable("Unsupported BFE bit size");
3308
3309
if (dst.type() == RegType::sgpr) {
3310
Temp base = get_alu_src(ctx, instr->src[0]);
3311
3312
nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3313
nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3314
if (const_offset && const_bits) {
3315
uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3316
aco_opcode opcode =
3317
instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3318
bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3319
break;
3320
}
3321
3322
Temp offset = get_alu_src(ctx, instr->src[1]);
3323
Temp bits = get_alu_src(ctx, instr->src[2]);
3324
if (instr->op == nir_op_ubfe) {
3325
Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3326
Temp masked =
3327
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3328
bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3329
} else {
3330
Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3331
: bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3332
bld.def(s1, scc), bits, Operand::c32(16u));
3333
Operand offset_op = const_offset
3334
? Operand::c32(const_offset->u32 & 0x1fu)
3335
: bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3336
offset, Operand::c32(0x1fu));
3337
3338
Temp extract =
3339
bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3340
bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3341
}
3342
3343
} else {
3344
aco_opcode opcode =
3345
instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3346
emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3347
}
3348
break;
3349
}
3350
case nir_op_extract_u8:
3351
case nir_op_extract_i8:
3352
case nir_op_extract_u16:
3353
case nir_op_extract_i16: {
3354
bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3355
unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3356
uint32_t bits = comp == 4 ? 8 : 16;
3357
unsigned index = nir_src_as_uint(instr->src[1].src);
3358
if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3359
assert(index == 0);
3360
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3361
} else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
3362
Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3363
unsigned swizzle = instr->src[0].swizzle[0];
3364
if (vec.size() > 1) {
3365
vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3366
swizzle = swizzle & 1;
3367
}
3368
index += swizzle * instr->dest.dest.ssa.bit_size / bits;
3369
bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3370
Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3371
} else {
3372
Temp src = get_alu_src(ctx, instr->src[0]);
3373
Definition def(dst);
3374
if (dst.bytes() == 8) {
3375
src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3376
index %= comp;
3377
def = bld.def(src.type(), 1);
3378
}
3379
assert(def.bytes() <= 4);
3380
if (def.regClass() == s1) {
3381
bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3382
Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3383
} else {
3384
src = emit_extract_vector(ctx, src, 0, def.regClass());
3385
bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3386
Operand::c32(bits), Operand::c32(is_signed));
3387
}
3388
if (dst.size() == 2)
3389
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3390
Operand::zero());
3391
}
3392
break;
3393
}
3394
case nir_op_insert_u8:
3395
case nir_op_insert_u16: {
3396
unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3397
uint32_t bits = comp == 4 ? 8 : 16;
3398
unsigned index = nir_src_as_uint(instr->src[1].src);
3399
if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3400
assert(index == 0);
3401
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3402
} else {
3403
Temp src = get_alu_src(ctx, instr->src[0]);
3404
Definition def(dst);
3405
bool swap = false;
3406
if (dst.bytes() == 8) {
3407
src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3408
swap = index >= comp;
3409
index %= comp;
3410
def = bld.def(src.type(), 1);
3411
}
3412
if (def.regClass() == s1) {
3413
bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3414
Operand::c32(index), Operand::c32(bits));
3415
} else {
3416
src = emit_extract_vector(ctx, src, 0, def.regClass());
3417
bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3418
Operand::c32(bits));
3419
}
3420
if (dst.size() == 2 && swap)
3421
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3422
def.getTemp());
3423
else if (dst.size() == 2)
3424
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3425
Operand::zero());
3426
}
3427
break;
3428
}
3429
case nir_op_bit_count: {
3430
Temp src = get_alu_src(ctx, instr->src[0]);
3431
if (src.regClass() == s1) {
3432
bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3433
} else if (src.regClass() == v1) {
3434
bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3435
} else if (src.regClass() == v2) {
3436
bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3437
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3438
emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3439
} else if (src.regClass() == s2) {
3440
bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3441
} else {
3442
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3443
}
3444
break;
3445
}
3446
case nir_op_flt: {
3447
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3448
aco_opcode::v_cmp_lt_f64);
3449
break;
3450
}
3451
case nir_op_fge: {
3452
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3453
aco_opcode::v_cmp_ge_f64);
3454
break;
3455
}
3456
case nir_op_feq: {
3457
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3458
aco_opcode::v_cmp_eq_f64);
3459
break;
3460
}
3461
case nir_op_fneu: {
3462
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3463
aco_opcode::v_cmp_neq_f64);
3464
break;
3465
}
3466
case nir_op_ilt: {
3467
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3468
aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3469
break;
3470
}
3471
case nir_op_ige: {
3472
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3473
aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3474
break;
3475
}
3476
case nir_op_ieq: {
3477
if (instr->src[0].src.ssa->bit_size == 1)
3478
emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3479
else
3480
emit_comparison(
3481
ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3482
aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3483
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3484
break;
3485
}
3486
case nir_op_ine: {
3487
if (instr->src[0].src.ssa->bit_size == 1)
3488
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3489
else
3490
emit_comparison(
3491
ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3492
aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3493
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3494
break;
3495
}
3496
case nir_op_ult: {
3497
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3498
aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3499
break;
3500
}
3501
case nir_op_uge: {
3502
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3503
aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3504
break;
3505
}
3506
case nir_op_fddx:
3507
case nir_op_fddy:
3508
case nir_op_fddx_fine:
3509
case nir_op_fddy_fine:
3510
case nir_op_fddx_coarse:
3511
case nir_op_fddy_coarse: {
3512
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3513
uint16_t dpp_ctrl1, dpp_ctrl2;
3514
if (instr->op == nir_op_fddx_fine) {
3515
dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3516
dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3517
} else if (instr->op == nir_op_fddy_fine) {
3518
dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3519
dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3520
} else {
3521
dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3522
if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3523
dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3524
else
3525
dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3526
}
3527
3528
Temp tmp;
3529
if (ctx->program->chip_class >= GFX8) {
3530
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3531
tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3532
} else {
3533
Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3534
Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3535
tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3536
}
3537
emit_wqm(bld, tmp, dst, true);
3538
break;
3539
}
3540
default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3541
}
3542
}
3543
3544
void
3545
visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3546
{
3547
Temp dst = get_ssa_temp(ctx, &instr->def);
3548
3549
// TODO: we really want to have the resulting type, as this would allow for 64-bit literals
// (which currently get truncated: to the lsb if double and to the msb if int).
// For now, we only use s_mov_b64 with 64-bit inline constants.
3552
assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3553
assert(dst.type() == RegType::sgpr);
3554
3555
Builder bld(ctx->program, ctx->block);
3556
3557
if (instr->def.bit_size == 1) {
3558
assert(dst.regClass() == bld.lm);
3559
int val = instr->value[0].b ? -1 : 0;
3560
Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3561
bld.copy(Definition(dst), op);
3562
} else if (instr->def.bit_size == 8) {
3563
bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3564
} else if (instr->def.bit_size == 16) {
3565
/* sign-extend to use s_movk_i32 instead of a literal */
3566
bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3567
} else if (dst.size() == 1) {
3568
bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3569
} else {
3570
assert(dst.size() != 1);
3571
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3572
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3573
if (instr->def.bit_size == 64)
3574
for (unsigned i = 0; i < dst.size(); i++)
3575
vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3576
else {
3577
for (unsigned i = 0; i < dst.size(); i++)
3578
vec->operands[i] = Operand::c32(instr->value[i].u32);
3579
}
3580
vec->definitions[0] = Definition(dst);
3581
ctx->block->instructions.emplace_back(std::move(vec));
3582
}
3583
}
3584
3585
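/* Expands each bit of `mask` into `multiplier` consecutive bits,
 * e.g. widen_mask(0b101, 2) == 0b110011. */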
uint32_t
widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
      if (mask & (1u << i))
         new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}
3594
3595
struct LoadEmitInfo {
   Operand offset;
   Temp dst;
   unsigned num_components;
   unsigned component_size;
   Temp resource = Temp(0, s1);
   unsigned component_stride = 0;
   unsigned const_offset = 0;
   unsigned align_mul = 0;
   unsigned align_offset = 0;

   bool glc = false;
   bool slc = false;
   unsigned swizzle_component_size = 0;
   memory_sync_info sync;
   Temp soffset = Temp(0, s1);
};
3612
3613
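/* Per-memory-type parameters for emit_load(): the callback emits a single hardware load,
 * and the flags describe what that kind of load supports (byte-aligned accesses, sub-dword
 * results) plus the exclusive upper bound on the constant offset it accepts. */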
struct EmitLoadParameters {
   using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
                             unsigned bytes_needed, unsigned align, unsigned const_offset,
                             Temp dst_hint);

   Callback callback;
   bool byte_align_loads;
   bool supports_8bit_16bit_loads;
   unsigned max_const_offset_plus_one;
};
3623
3624
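/* Splits a load into chunks the callback can emit (within its alignment and constant-offset
 * limits), then recombines the chunks into info.dst, going through p_as_uniform when an SGPR
 * destination has to be assembled from VGPR parts. */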
void
3625
emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3626
const EmitLoadParameters& params)
3627
{
3628
unsigned load_size = info.num_components * info.component_size;
3629
unsigned component_size = info.component_size;
3630
3631
unsigned num_vals = 0;
3632
Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3633
3634
unsigned const_offset = info.const_offset;
3635
3636
const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3637
unsigned align_offset = (info.align_offset + const_offset) % align_mul;
3638
3639
unsigned bytes_read = 0;
3640
while (bytes_read < load_size) {
3641
unsigned bytes_needed = load_size - bytes_read;
3642
3643
/* add buffer for unaligned loads */
3644
int byte_align = 0;
3645
if (params.byte_align_loads) {
3646
byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3647
}
3648
3649
if (byte_align) {
3650
if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3651
!params.supports_8bit_16bit_loads) {
3652
if (info.component_stride) {
3653
assert(params.supports_8bit_16bit_loads && "unimplemented");
3654
bytes_needed = 2;
3655
byte_align = 0;
3656
} else {
3657
bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3658
bytes_needed = align(bytes_needed, 4);
3659
}
3660
} else {
3661
byte_align = 0;
3662
}
3663
}
3664
3665
if (info.swizzle_component_size)
3666
bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3667
if (info.component_stride)
3668
bytes_needed = MIN2(bytes_needed, info.component_size);
3669
3670
bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3671
3672
/* reduce constant offset */
3673
Operand offset = info.offset;
3674
unsigned reduced_const_offset = const_offset;
3675
bool remove_const_offset_completely = need_to_align_offset;
3676
if (const_offset &&
3677
(remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
3678
unsigned to_add = const_offset;
3679
if (remove_const_offset_completely) {
3680
reduced_const_offset = 0;
3681
} else {
3682
to_add =
3683
const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
3684
reduced_const_offset %= params.max_const_offset_plus_one;
3685
}
3686
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3687
if (offset.isConstant()) {
3688
offset = Operand::c32(offset.constantValue() + to_add);
3689
} else if (offset_tmp.regClass() == s1) {
3690
offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
3691
Operand::c32(to_add));
3692
} else if (offset_tmp.regClass() == v1) {
3693
offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
3694
} else {
3695
Temp lo = bld.tmp(offset_tmp.type(), 1);
3696
Temp hi = bld.tmp(offset_tmp.type(), 1);
3697
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3698
3699
if (offset_tmp.regClass() == s2) {
3700
Temp carry = bld.tmp(s1);
3701
lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
3702
Operand::c32(to_add));
3703
hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3704
offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3705
} else {
3706
Temp new_lo = bld.tmp(v1);
3707
Temp carry =
3708
bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
3709
hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
3710
offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3711
}
3712
}
3713
}
3714
3715
/* align offset down if needed */
3716
Operand aligned_offset = offset;
3717
unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3718
if (need_to_align_offset) {
3719
align = 4;
3720
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3721
if (offset.isConstant()) {
3722
aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
3723
} else if (offset_tmp.regClass() == s1) {
3724
aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3725
Operand::c32(0xfffffffcu), offset_tmp);
3726
} else if (offset_tmp.regClass() == s2) {
3727
aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
3728
Operand::c64(0xfffffffffffffffcllu), offset_tmp);
3729
} else if (offset_tmp.regClass() == v1) {
3730
aligned_offset =
3731
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
3732
} else if (offset_tmp.regClass() == v2) {
3733
Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3734
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3735
lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
3736
aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3737
}
3738
}
3739
Temp aligned_offset_tmp =
3740
aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
3741
3742
Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3743
reduced_const_offset, byte_align ? Temp() : info.dst);
3744
3745
/* the callback wrote directly to dst */
3746
if (val == info.dst) {
3747
assert(num_vals == 0);
3748
emit_split_vector(ctx, info.dst, info.num_components);
3749
return;
3750
}
3751
3752
/* shift result right if needed */
3753
if (params.byte_align_loads && info.component_size < 4) {
3754
Operand byte_align_off = Operand::c32(byte_align);
3755
if (byte_align == -1) {
3756
if (offset.isConstant())
3757
byte_align_off = Operand::c32(offset.constantValue() % 4u);
3758
else if (offset.size() == 2)
3759
byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
3760
RegClass(offset.getTemp().type(), 1)));
3761
else
3762
byte_align_off = offset;
3763
}
3764
3765
assert(val.bytes() >= load_size && "unimplemented");
3766
if (val.type() == RegType::sgpr)
3767
byte_align_scalar(ctx, val, byte_align_off, info.dst);
3768
else
3769
byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
3770
return;
3771
}
3772
3773
/* add result to list and advance */
3774
if (info.component_stride) {
3775
assert(val.bytes() == info.component_size && "unimplemented");
3776
const_offset += info.component_stride;
3777
align_offset = (align_offset + info.component_stride) % align_mul;
3778
} else {
3779
const_offset += val.bytes();
3780
align_offset = (align_offset + val.bytes()) % align_mul;
3781
}
3782
bytes_read += val.bytes();
3783
vals[num_vals++] = val;
3784
}
3785
3786
/* create array of components */
3787
unsigned components_split = 0;
3788
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3789
bool has_vgprs = false;
3790
for (unsigned i = 0; i < num_vals;) {
3791
Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
3792
unsigned num_tmps = 0;
3793
unsigned tmp_size = 0;
3794
RegType reg_type = RegType::sgpr;
3795
while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3796
if (vals[i].type() == RegType::vgpr)
3797
reg_type = RegType::vgpr;
3798
tmp_size += vals[i].bytes();
3799
tmp[num_tmps++] = vals[i++];
3800
}
3801
if (num_tmps > 1) {
3802
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3803
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3804
for (unsigned j = 0; j < num_tmps; j++)
3805
vec->operands[j] = Operand(tmp[j]);
3806
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3807
vec->definitions[0] = Definition(tmp[0]);
3808
bld.insert(std::move(vec));
3809
}
3810
3811
if (tmp[0].bytes() % component_size) {
3812
/* trim tmp[0] */
3813
assert(i == num_vals);
3814
RegClass new_rc =
3815
RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3816
tmp[0] =
3817
bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
3818
}
3819
3820
RegClass elem_rc = RegClass::get(reg_type, component_size);
3821
3822
unsigned start = components_split;
3823
3824
if (tmp_size == elem_rc.bytes()) {
3825
allocated_vec[components_split++] = tmp[0];
3826
} else {
3827
assert(tmp_size % elem_rc.bytes() == 0);
3828
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3829
aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3830
for (auto& def : split->definitions) {
3831
Temp component = bld.tmp(elem_rc);
3832
allocated_vec[components_split++] = component;
3833
def = Definition(component);
3834
}
3835
split->operands[0] = Operand(tmp[0]);
3836
bld.insert(std::move(split));
3837
}
3838
3839
/* try to p_as_uniform early so we can create more optimizable code and
3840
* also update allocated_vec */
3841
for (unsigned j = start; j < components_split; j++) {
3842
if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
3843
allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3844
has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3845
}
3846
}
3847
3848
/* concatenate components and p_as_uniform() result if needed */
3849
if (info.dst.type() == RegType::vgpr || !has_vgprs)
3850
ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
3851
3852
int padding_bytes =
3853
MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
3854
3855
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3856
aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
3857
for (unsigned i = 0; i < info.num_components; i++)
3858
vec->operands[i] = Operand(allocated_vec[i]);
3859
if (padding_bytes)
3860
vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3861
if (info.dst.type() == RegType::sgpr && has_vgprs) {
3862
Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
3863
vec->definitions[0] = Definition(tmp);
3864
bld.insert(std::move(vec));
3865
bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
3866
} else {
3867
vec->definitions[0] = Definition(info.dst);
3868
bld.insert(std::move(vec));
3869
}
3870
}
3871
3872
Operand
load_lds_size_m0(Builder& bld)
{
   /* TODO: m0 does not need to be initialized on GFX9+ */
   return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
}
3878
3879
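/* Chooses the widest ds_read the requested size, alignment and chip allow (b96/b128 need
 * GFX7+, the read2 forms need a suitably aligned constant offset), and moves any excess
 * constant offset into the address VGPR since the DS offset fields are limited. */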
Temp
3880
lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3881
unsigned align, unsigned const_offset, Temp dst_hint)
3882
{
3883
offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3884
3885
Operand m = load_lds_size_m0(bld);
3886
3887
bool large_ds_read = bld.program->chip_class >= GFX7;
3888
bool usable_read2 = bld.program->chip_class >= GFX7;
3889
3890
bool read2 = false;
3891
unsigned size = 0;
3892
aco_opcode op;
3893
if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3894
size = 16;
3895
op = aco_opcode::ds_read_b128;
3896
} else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3897
size = 16;
3898
read2 = true;
3899
op = aco_opcode::ds_read2_b64;
3900
} else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3901
size = 12;
3902
op = aco_opcode::ds_read_b96;
3903
} else if (bytes_needed >= 8 && align % 8 == 0) {
3904
size = 8;
3905
op = aco_opcode::ds_read_b64;
3906
} else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
3907
size = 8;
3908
read2 = true;
3909
op = aco_opcode::ds_read2_b32;
3910
} else if (bytes_needed >= 4 && align % 4 == 0) {
3911
size = 4;
3912
op = aco_opcode::ds_read_b32;
3913
} else if (bytes_needed >= 2 && align % 2 == 0) {
3914
size = 2;
3915
op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
3916
} else {
3917
size = 1;
3918
op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
3919
}
3920
3921
unsigned const_offset_unit = read2 ? size / 2u : 1u;
3922
unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
3923
3924
if (const_offset > (const_offset_range - const_offset_unit)) {
3925
unsigned excess = const_offset - (const_offset % const_offset_range);
3926
offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
3927
const_offset -= excess;
3928
}
3929
3930
const_offset /= const_offset_unit;
3931
3932
RegClass rc = RegClass::get(RegType::vgpr, size);
3933
Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3934
Instruction* instr;
3935
if (read2)
3936
instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3937
else
3938
instr = bld.ds(op, Definition(val), offset, m, const_offset);
3939
instr->ds().sync = info.sync;
3940
3941
return val;
3942
}
3943
3944
const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
3945
3946
Temp
3947
smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3948
unsigned align, unsigned const_offset, Temp dst_hint)
3949
{
3950
unsigned size = 0;
3951
aco_opcode op;
3952
if (bytes_needed <= 4) {
3953
size = 1;
3954
op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3955
} else if (bytes_needed <= 8) {
3956
size = 2;
3957
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3958
} else if (bytes_needed <= 16) {
3959
size = 4;
3960
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3961
} else if (bytes_needed <= 32) {
3962
size = 8;
3963
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3964
} else {
3965
size = 16;
3966
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3967
}
3968
aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3969
if (info.resource.id()) {
3970
load->operands[0] = Operand(info.resource);
3971
load->operands[1] = Operand(offset);
3972
} else {
3973
load->operands[0] = Operand(offset);
3974
load->operands[1] = Operand::zero();
3975
}
3976
RegClass rc(RegType::sgpr, size);
3977
Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3978
load->definitions[0] = Definition(val);
3979
load->glc = info.glc;
3980
load->dlc = info.glc && bld.program->chip_class >= GFX10;
3981
load->sync = info.sync;
3982
bld.insert(std::move(load));
3983
return val;
3984
}
3985
3986
const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
3987
3988
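/* Buffer loads may start at any byte, but the dword-sized forms assume natural alignment,
 * so insufficiently aligned requests fall back to buffer_load_ubyte/ushort. */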
Temp
3989
mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3990
unsigned align_, unsigned const_offset, Temp dst_hint)
3991
{
3992
Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3993
Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
3994
3995
if (info.soffset.id()) {
3996
if (soffset.isTemp())
3997
vaddr = bld.copy(bld.def(v1), soffset);
3998
soffset = Operand(info.soffset);
3999
}
4000
4001
unsigned bytes_size = 0;
4002
aco_opcode op;
4003
if (bytes_needed == 1 || align_ % 2) {
4004
bytes_size = 1;
4005
op = aco_opcode::buffer_load_ubyte;
4006
} else if (bytes_needed == 2 || align_ % 4) {
4007
bytes_size = 2;
4008
op = aco_opcode::buffer_load_ushort;
4009
} else if (bytes_needed <= 4) {
4010
bytes_size = 4;
4011
op = aco_opcode::buffer_load_dword;
4012
} else if (bytes_needed <= 8) {
4013
bytes_size = 8;
4014
op = aco_opcode::buffer_load_dwordx2;
4015
} else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
4016
bytes_size = 12;
4017
op = aco_opcode::buffer_load_dwordx3;
4018
} else {
4019
bytes_size = 16;
4020
op = aco_opcode::buffer_load_dwordx4;
4021
}
4022
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4023
mubuf->operands[0] = Operand(info.resource);
4024
mubuf->operands[1] = vaddr;
4025
mubuf->operands[2] = soffset;
4026
mubuf->offen = (offset.type() == RegType::vgpr);
4027
mubuf->glc = info.glc;
4028
mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
4029
mubuf->slc = info.slc;
4030
mubuf->sync = info.sync;
4031
mubuf->offset = const_offset;
4032
mubuf->swizzled = info.swizzle_component_size != 0;
4033
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4034
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4035
mubuf->definitions[0] = Definition(val);
4036
bld.insert(std::move(mubuf));
4037
4038
return val;
4039
}
4040
4041
const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4042
const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
4043
4044
Temp
get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

   if (addr.type() == RegType::vgpr)
      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
                        Operand::c32(-1u), Operand::c32(rsrc_conf));
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}
4056
4057
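/* GFX6 has no FLAT addressing, so global access is emulated with an addr64 MUBUF using the
 * resource from get_gfx6_global_rsrc(); GFX7-8 use FLAT and GFX9+ the GLOBAL encoding. */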
Temp
4058
global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4059
unsigned align_, unsigned const_offset, Temp dst_hint)
4060
{
4061
unsigned bytes_size = 0;
4062
bool use_mubuf = bld.program->chip_class == GFX6;
4063
bool global = bld.program->chip_class >= GFX9;
4064
aco_opcode op;
4065
if (bytes_needed == 1) {
4066
bytes_size = 1;
4067
op = use_mubuf ? aco_opcode::buffer_load_ubyte
4068
: global ? aco_opcode::global_load_ubyte
4069
: aco_opcode::flat_load_ubyte;
4070
} else if (bytes_needed == 2) {
4071
bytes_size = 2;
4072
op = use_mubuf ? aco_opcode::buffer_load_ushort
4073
: global ? aco_opcode::global_load_ushort
4074
: aco_opcode::flat_load_ushort;
4075
} else if (bytes_needed <= 4) {
4076
bytes_size = 4;
4077
op = use_mubuf ? aco_opcode::buffer_load_dword
4078
: global ? aco_opcode::global_load_dword
4079
: aco_opcode::flat_load_dword;
4080
} else if (bytes_needed <= 8) {
4081
bytes_size = 8;
4082
op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4083
: global ? aco_opcode::global_load_dwordx2
4084
: aco_opcode::flat_load_dwordx2;
4085
} else if (bytes_needed <= 12 && !use_mubuf) {
4086
bytes_size = 12;
4087
op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4088
} else {
4089
bytes_size = 16;
4090
op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4091
: global ? aco_opcode::global_load_dwordx4
4092
: aco_opcode::flat_load_dwordx4;
4093
}
4094
RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
4095
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4096
if (use_mubuf) {
4097
aco_ptr<MUBUF_instruction> mubuf{
4098
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4099
mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
4100
mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4101
mubuf->operands[2] = Operand::zero();
4102
mubuf->glc = info.glc;
4103
mubuf->dlc = false;
4104
mubuf->offset = 0;
4105
mubuf->addr64 = offset.type() == RegType::vgpr;
4106
mubuf->disable_wqm = false;
4107
mubuf->sync = info.sync;
4108
mubuf->definitions[0] = Definition(val);
4109
bld.insert(std::move(mubuf));
4110
} else {
4111
offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
4112
4113
aco_ptr<FLAT_instruction> flat{
4114
create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4115
flat->operands[0] = Operand(offset);
4116
flat->operands[1] = Operand(s1);
4117
flat->glc = info.glc;
4118
flat->dlc = info.glc && bld.program->chip_class >= GFX10;
4119
flat->sync = info.sync;
4120
flat->offset = 0u;
4121
flat->definitions[0] = Definition(val);
4122
bld.insert(std::move(flat));
4123
}
4124
4125
return val;
4126
}
4127
4128
const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};
4129
4130
Temp
4131
load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4132
Temp address, unsigned base_offset, unsigned align)
4133
{
4134
assert(util_is_power_of_two_nonzero(align));
4135
4136
Builder bld(ctx->program, ctx->block);
4137
4138
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4139
info.align_mul = align;
4140
info.align_offset = 0;
4141
info.sync = memory_sync_info(storage_shared);
4142
info.const_offset = base_offset;
4143
emit_load(ctx, bld, info, lds_load_params);
4144
4145
return dst;
4146
}
4147
4148
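/* Split src into count temporaries of the given sizes (dst[i] receives bytes[i]
 * bytes). Re-uses the elements recorded in ctx->allocated_vec when they line up,
 * otherwise the source is split with p_split_vector and recombined as needed.
 */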
void
4149
split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4150
Temp src)
4151
{
4152
if (!count)
4153
return;
4154
4155
Builder bld(ctx->program, ctx->block);
4156
4157
/* count == 1 fast path */
4158
if (count == 1) {
4159
if (dst_type == RegType::sgpr)
4160
dst[0] = bld.as_uniform(src);
4161
else
4162
dst[0] = as_vgpr(ctx, src);
4163
return;
4164
}
4165
4166
/* elem_size_bytes is the greatest power of two that divides all the sizes (at most 8) */
4167
unsigned elem_size_bytes =
4168
1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4169
4170
ASSERTED bool is_subdword = elem_size_bytes < 4;
4171
assert(!is_subdword || dst_type == RegType::vgpr);
4172
4173
for (unsigned i = 0; i < count; i++)
4174
dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4175
4176
std::vector<Temp> temps;
4177
/* use allocated_vec if possible */
4178
auto it = ctx->allocated_vec.find(src.id());
4179
if (it != ctx->allocated_vec.end()) {
4180
if (!it->second[0].id())
4181
goto split;
4182
unsigned elem_size = it->second[0].bytes();
4183
assert(src.bytes() % elem_size == 0);
4184
4185
for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4186
if (!it->second[i].id())
4187
goto split;
4188
}
4189
if (elem_size_bytes % elem_size)
4190
goto split;
4191
4192
temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4193
elem_size_bytes = elem_size;
4194
}
4195
4196
split:
4197
/* split src if necessary */
4198
if (temps.empty()) {
4199
if (is_subdword && src.type() == RegType::sgpr)
4200
src = as_vgpr(ctx, src);
4201
if (dst_type == RegType::sgpr)
4202
src = bld.as_uniform(src);
4203
4204
unsigned num_elems = src.bytes() / elem_size_bytes;
4205
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4206
aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4207
split->operands[0] = Operand(src);
4208
for (unsigned i = 0; i < num_elems; i++) {
4209
temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4210
split->definitions[i] = Definition(temps.back());
4211
}
4212
bld.insert(std::move(split));
4213
}
4214
4215
unsigned idx = 0;
4216
for (unsigned i = 0; i < count; i++) {
4217
unsigned op_count = dst[i].bytes() / elem_size_bytes;
4218
if (op_count == 1) {
4219
if (dst_type == RegType::sgpr)
4220
dst[i] = bld.as_uniform(temps[idx++]);
4221
else
4222
dst[i] = as_vgpr(ctx, temps[idx++]);
4223
continue;
4224
}
4225
4226
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4227
Format::PSEUDO, op_count, 1)};
4228
for (unsigned j = 0; j < op_count; j++) {
4229
Temp tmp = temps[idx++];
4230
if (dst_type == RegType::sgpr)
4231
tmp = bld.as_uniform(tmp);
4232
vec->operands[j] = Operand(tmp);
4233
}
4234
vec->definitions[0] = Definition(dst[i]);
4235
bld.insert(std::move(vec));
4236
}
4237
return;
4238
}
4239
4240
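/* Find the next consecutive byte range of the write mask, starting at the lowest
 * bit still set in todo_mask. Returns false if that range is a gap (bytes that
 * are skipped rather than written). E.g. with mask=0b1100 and todo_mask=0b1111
 * this first yields the skipped range start=0/count=2, then the written range
 * start=2/count=2.
 */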
bool
4241
scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4242
{
4243
unsigned start_elem = ffs(todo_mask) - 1;
4244
bool skip = !(mask & (1 << start_elem));
4245
if (skip)
4246
mask = ~mask & todo_mask;
4247
4248
mask &= todo_mask;
4249
4250
u_bit_scan_consecutive_range(&mask, start, count);
4251
4252
return !skip;
4253
}
4254
4255
void
4256
advance_write_mask(uint32_t* todo_mask, int start, int count)
4257
{
4258
*todo_mask &= ~u_bit_consecutive(0, count) << start;
4259
}
4260
4261
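/* Store data to LDS at address + base_offset according to wrmask. The data is
 * split into ds_write_b8/b16/b32/b64 (and b96/b128 on GFX7+) operations based on
 * size and alignment, and matching b32/b64 writes are combined into ds_write2
 * where possible.
 */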
void
4262
store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4263
unsigned base_offset, unsigned align)
4264
{
4265
assert(util_is_power_of_two_nonzero(align));
4266
assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4267
4268
Builder bld(ctx->program, ctx->block);
4269
bool large_ds_write = ctx->options->chip_class >= GFX7;
4270
bool usable_write2 = ctx->options->chip_class >= GFX7;
4271
4272
unsigned write_count = 0;
4273
Temp write_datas[32];
4274
unsigned offsets[32];
4275
unsigned bytes[32];
4276
aco_opcode opcodes[32];
4277
4278
wrmask = widen_mask(wrmask, elem_size_bytes);
4279
4280
uint32_t todo = u_bit_consecutive(0, data.bytes());
4281
while (todo) {
4282
int offset, byte;
4283
if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4284
offsets[write_count] = offset;
4285
bytes[write_count] = byte;
4286
opcodes[write_count] = aco_opcode::num_opcodes;
4287
write_count++;
4288
advance_write_mask(&todo, offset, byte);
4289
continue;
4290
}
4291
4292
bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4293
bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4294
bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4295
bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4296
4297
// TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4298
aco_opcode op = aco_opcode::num_opcodes;
4299
if (byte >= 16 && aligned16 && large_ds_write) {
4300
op = aco_opcode::ds_write_b128;
4301
byte = 16;
4302
} else if (byte >= 12 && aligned16 && large_ds_write) {
4303
op = aco_opcode::ds_write_b96;
4304
byte = 12;
4305
} else if (byte >= 8 && aligned8) {
4306
op = aco_opcode::ds_write_b64;
4307
byte = 8;
4308
} else if (byte >= 4 && aligned4) {
4309
op = aco_opcode::ds_write_b32;
4310
byte = 4;
4311
} else if (byte >= 2 && aligned2) {
4312
op = aco_opcode::ds_write_b16;
4313
byte = 2;
4314
} else if (byte >= 1) {
4315
op = aco_opcode::ds_write_b8;
4316
byte = 1;
4317
} else {
4318
assert(false);
4319
}
4320
4321
offsets[write_count] = offset;
4322
bytes[write_count] = byte;
4323
opcodes[write_count] = op;
4324
write_count++;
4325
advance_write_mask(&todo, offset, byte);
4326
}
4327
4328
Operand m = load_lds_size_m0(bld);
4329
4330
split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4331
4332
for (unsigned i = 0; i < write_count; i++) {
4333
aco_opcode op = opcodes[i];
4334
if (op == aco_opcode::num_opcodes)
4335
continue;
4336
4337
Temp split_data = write_datas[i];
4338
4339
unsigned second = write_count;
4340
if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4341
for (second = i + 1; second < write_count; second++) {
4342
if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4343
op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4344
opcodes[second] = aco_opcode::num_opcodes;
4345
break;
4346
}
4347
}
4348
}
4349
4350
bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4351
unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4352
4353
unsigned inline_offset = base_offset + offsets[i];
4354
unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4355
Temp address_offset = address;
4356
if (inline_offset > max_offset) {
4357
address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4358
inline_offset = offsets[i];
4359
}
4360
4361
/* offsets[i] shouldn't be large enough for this to happen */
4362
assert(inline_offset <= max_offset);
4363
4364
Instruction* instr;
4365
if (write2) {
4366
Temp second_data = write_datas[second];
4367
inline_offset /= split_data.bytes();
4368
instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4369
inline_offset + write2_off);
4370
} else {
4371
instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4372
}
4373
instr->ds().sync = memory_sync_info(storage_shared);
4374
}
4375
}
4376
4377
aco_opcode
4378
get_buffer_store_op(unsigned bytes)
4379
{
4380
switch (bytes) {
4381
case 1: return aco_opcode::buffer_store_byte;
4382
case 2: return aco_opcode::buffer_store_short;
4383
case 4: return aco_opcode::buffer_store_dword;
4384
case 8: return aco_opcode::buffer_store_dwordx2;
4385
case 12: return aco_opcode::buffer_store_dwordx3;
4386
case 16: return aco_opcode::buffer_store_dwordx4;
4387
}
4388
unreachable("Unexpected store size");
4389
return aco_opcode::num_opcodes;
4390
}
4391
4392
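/* Determine how to split a buffer store according to the write mask: each part is
 * limited to swizzle_element_size, to the store sizes the hardware supports (no
 * 12-byte stores with SMEM or on GFX6) and to what the known alignment allows,
 * then the data is split accordingly and skipped ranges are dropped.
 */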
void
4393
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4394
Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4395
Temp* write_datas, unsigned* offsets)
4396
{
4397
unsigned write_count_with_skips = 0;
4398
bool skips[16];
4399
unsigned bytes[16];
4400
4401
/* determine how to split the data */
4402
unsigned todo = u_bit_consecutive(0, data.bytes());
4403
while (todo) {
4404
int offset, byte;
4405
skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4406
offsets[write_count_with_skips] = offset;
4407
if (skips[write_count_with_skips]) {
4408
bytes[write_count_with_skips] = byte;
4409
advance_write_mask(&todo, offset, byte);
4410
write_count_with_skips++;
4411
continue;
4412
}
4413
4414
/* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, which can't be
4415
* larger than swizzle_element_size */
4416
byte = MIN2(byte, swizzle_element_size);
4417
if (byte % 4)
4418
byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
4419
4420
/* SMEM and GFX6 VMEM can't emit 12-byte stores */
4421
if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
4422
byte = 8;
4423
4424
/* dword or larger stores have to be dword-aligned */
4425
unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
4426
unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
4427
bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
4428
if (!dword_aligned)
4429
byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
4430
4431
bytes[write_count_with_skips] = byte;
4432
advance_write_mask(&todo, offset, byte);
4433
write_count_with_skips++;
4434
}
4435
4436
/* actually split data */
4437
split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
4438
4439
/* remove skips */
4440
for (unsigned i = 0; i < write_count_with_skips; i++) {
4441
if (skips[i])
4442
continue;
4443
write_datas[*write_count] = write_datas[i];
4444
offsets[*write_count] = offsets[i];
4445
(*write_count)++;
4446
}
4447
}
4448
4449
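/* Create a vector from cnt elements of elem_size_bytes each; unset elements are
 * replaced with zero. If split_cnt is set, the result is split again into that
 * many components, otherwise the elements are recorded in ctx->allocated_vec.
 */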
Temp
4450
create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
4451
unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
4452
{
4453
Builder bld(ctx->program, ctx->block);
4454
unsigned dword_size = elem_size_bytes / 4;
4455
4456
if (!dst.id())
4457
dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
4458
4459
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4460
aco_ptr<Pseudo_instruction> instr{
4461
create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
4462
instr->definitions[0] = Definition(dst);
4463
4464
for (unsigned i = 0; i < cnt; ++i) {
4465
if (arr[i].id()) {
4466
assert(arr[i].size() == dword_size);
4467
allocated_vec[i] = arr[i];
4468
instr->operands[i] = Operand(arr[i]);
4469
} else {
4470
Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
4471
Operand::zero(dword_size == 2 ? 8 : 4));
4472
allocated_vec[i] = zero;
4473
instr->operands[i] = Operand(zero);
4474
}
4475
}
4476
4477
bld.insert(std::move(instr));
4478
4479
if (split_cnt)
4480
emit_split_vector(ctx, dst, split_cnt);
4481
else
4482
ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
4483
4484
return dst;
4485
}
4486
4487
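/* MUBUF immediate offsets have to stay below 4096, so fold any excess multiple of
 * 4096 into voffset and return the remaining constant offset. E.g. a const_offset
 * of 5000 adds 4096 to voffset and returns 904.
 */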
inline unsigned
4488
resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
4489
{
4490
if (const_offset >= 4096) {
4491
unsigned excess_const_offset = const_offset / 4096u * 4096u;
4492
const_offset %= 4096u;
4493
4494
if (!voffset.id())
4495
voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
4496
else if (unlikely(voffset.regClass() == s1))
4497
voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
4498
Operand::c32(excess_const_offset), Operand(voffset));
4499
else if (likely(voffset.regClass() == v1))
4500
voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
4501
else
4502
unreachable("Unsupported register class of voffset");
4503
}
4504
4505
return const_offset;
4506
}
4507
4508
void
4509
emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
4510
unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
4511
bool slc = false, bool swizzled = false)
4512
{
4513
assert(vdata.id());
4514
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4515
assert(vdata.size() >= 1 && vdata.size() <= 4);
4516
4517
Builder bld(ctx->program, ctx->block);
4518
aco_opcode op = get_buffer_store_op(vdata.bytes());
4519
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4520
4521
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4522
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
4523
Builder::Result r =
4524
bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4525
/* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4526
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4527
/* dlc*/ false, /* slc */ slc);
4528
4529
r.instr->mubuf().sync = sync;
4530
}
4531
4532
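/* Store src to a buffer with MUBUF: the data is split according to write_mask
 * (into parts of up to 16 bytes when combining is allowed, 4 bytes otherwise) and
 * one store is emitted per part.
 */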
void
4533
store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4534
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4535
bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
4536
bool slc = false)
4537
{
4538
Builder bld(ctx->program, ctx->block);
4539
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4540
assert(write_mask);
4541
write_mask = widen_mask(write_mask, elem_size_bytes);
4542
4543
unsigned write_count = 0;
4544
Temp write_datas[32];
4545
unsigned offsets[32];
4546
split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
4547
&write_count, write_datas, offsets);
4548
4549
for (unsigned i = 0; i < write_count; i++) {
4550
unsigned const_offset = offsets[i] + base_const_offset;
4551
emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
4552
slc, !allow_combining);
4553
}
4554
}
4555
4556
void
4557
load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4558
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4559
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
4560
bool slc = false)
4561
{
4562
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4563
assert((num_components * elem_size_bytes) == dst.bytes());
4564
assert(!!stride != allow_combining);
4565
4566
Builder bld(ctx->program, ctx->block);
4567
4568
LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4569
info.component_stride = allow_combining ? 0 : stride;
4570
info.glc = true;
4571
info.slc = slc;
4572
info.swizzle_component_size = allow_combining ? 0 : 4;
4573
info.align_mul = MIN2(elem_size_bytes, 4);
4574
info.align_offset = 0;
4575
info.soffset = soffset;
4576
info.const_offset = base_const_offset;
4577
emit_load(ctx, bld, info, mubuf_load_params);
4578
}
4579
4580
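/* The wave id within the threadgroup is stored in bits [24:27] of
 * merged_wave_info, extracted here with s_bfe_u32.
 */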
Temp
4581
wave_id_in_threadgroup(isel_context* ctx)
4582
{
4583
Builder bld(ctx->program, ctx->block);
4584
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4585
get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4586
}
4587
4588
Temp
4589
thread_id_in_threadgroup(isel_context* ctx)
4590
{
4591
/* tid_in_tg = wave_id * wave_size + tid_in_wave */
4592
4593
Builder bld(ctx->program, ctx->block);
4594
Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4595
4596
if (ctx->program->workgroup_size <= ctx->program->wave_size)
4597
return tid_in_wave;
4598
4599
Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4600
Temp num_pre_threads =
4601
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4602
Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4603
return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4604
}
4605
4606
Temp
4607
get_tess_rel_patch_id(isel_context* ctx)
4608
{
4609
Builder bld(ctx->program, ctx->block);
4610
4611
switch (ctx->shader->info.stage) {
4612
case MESA_SHADER_TESS_CTRL:
4613
return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4614
Operand::zero(), Operand::c32(8u), Operand::zero());
4615
case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4616
default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4617
}
4618
}
4619
4620
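/* Record the written output components in ctx->outputs instead of emitting a
 * store. Only handles constant zero offsets; returns false otherwise.
 */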
bool
4621
store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4622
{
4623
unsigned write_mask = nir_intrinsic_write_mask(instr);
4624
unsigned component = nir_intrinsic_component(instr);
4625
unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4626
nir_src offset = *nir_get_io_offset_src(instr);
4627
4628
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4629
return false;
4630
4631
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4632
4633
if (instr->src[0].ssa->bit_size == 64)
4634
write_mask = widen_mask(write_mask, 2);
4635
4636
RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4637
4638
for (unsigned i = 0; i < 8; ++i) {
4639
if (write_mask & (1 << i)) {
4640
ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4641
ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4642
}
4643
idx++;
4644
}
4645
4646
return true;
4647
}
4648
4649
bool
4650
load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4651
{
4652
/* Only TCS per-vertex inputs are supported by this function.
4653
* Per-vertex inputs only match between the VS and TCS invocation IDs when the number of invocations
4654
* is the same.
4655
*/
4656
if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4657
return false;
4658
4659
nir_src* off_src = nir_get_io_offset_src(instr);
4660
nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4661
nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4662
bool can_use_temps =
4663
nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4664
nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4665
4666
if (!can_use_temps)
4667
return false;
4668
4669
unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4670
4 * nir_src_as_uint(*off_src);
4671
Temp* src = &ctx->inputs.temps[idx];
4672
create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4673
4674
return true;
4675
}
4676
4677
static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4678
4679
void
4680
visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4681
{
4682
if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4683
ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4684
(ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4685
ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4686
bool stored_to_temps = store_output_to_temps(ctx, instr);
4687
if (!stored_to_temps) {
4688
isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4689
abort();
4690
}
4691
} else {
4692
unreachable("Shader stage not implemented");
4693
}
4694
4695
/* For NGG VS and TES shaders, the primitive ID is exported after the other exports, so we
4696
* have to emit the exp instruction for it here manually. */
4697
if (ctx->stage.hw == HWStage::NGG &&
4698
(ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4699
nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4700
export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4701
}
4702
4703
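/* Interpolate one input component at the given barycentric coordinates (src)
 * using v_interp_p1/p2. 16-bit destinations use the f16 variants, and chips with
 * 16-bank LDS need a workaround sequence for them.
 */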
void
4704
emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4705
Temp prim_mask)
4706
{
4707
Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4708
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4709
4710
Builder bld(ctx->program, ctx->block);
4711
4712
if (dst.regClass() == v2b) {
4713
if (ctx->program->dev.has_16bank_lds) {
4714
assert(ctx->options->chip_class <= GFX8);
4715
Builder::Result interp_p1 =
4716
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4717
bld.m0(prim_mask), idx, component);
4718
interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4719
bld.m0(prim_mask), interp_p1, idx, component);
4720
bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4721
interp_p1, idx, component);
4722
} else {
4723
aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4724
4725
if (ctx->options->chip_class == GFX8)
4726
interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4727
4728
Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4729
bld.m0(prim_mask), idx, component);
4730
bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4731
component);
4732
}
4733
} else {
4734
Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4735
bld.m0(prim_mask), idx, component);
4736
4737
if (ctx->program->dev.has_16bank_lds)
4738
interp_p1.instr->operands[0].setLateKill(true);
4739
4740
bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4741
idx, component);
4742
}
4743
}
4744
4745
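/* Assemble gl_FragCoord from the frag_pos arguments. The .w component is replaced
 * with 1/w when POS_W_FLOAT is enabled, and .z may be adjusted for VRS on affected
 * GFX10.3 chips.
 */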
void
4746
emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4747
{
4748
Builder bld(ctx->program, ctx->block);
4749
4750
aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4751
aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4752
for (unsigned i = 0; i < num_components; i++)
4753
vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4754
if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4755
assert(num_components == 4);
4756
vec->operands[3] =
4757
bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4758
}
4759
4760
if (ctx->options->adjust_frag_coord_z &&
4761
G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4762
/* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4763
Operand frag_z = vec->operands[2];
4764
Temp adjusted_frag_z = bld.tmp(v1);
4765
Temp tmp;
4766
4767
/* dFdx fine */
4768
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4769
tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4770
emit_wqm(bld, tmp, adjusted_frag_z, true);
4771
4772
/* adjusted_frag_z * 0.0625 + frag_z */
4773
adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4774
Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4775
4776
/* VRS Rate X = Ancillary[2:3] */
4777
Temp x_rate =
4778
bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4779
Operand::c32(2u), Operand::c32(2u));
4780
4781
/* frag_z = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4782
Temp cond =
4783
bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4784
vec->operands[2] =
4785
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4786
}
4787
4788
for (Operand& op : vec->operands)
4789
op = op.isUndefined() ? Operand::zero() : op;
4790
4791
vec->definitions[0] = Definition(dst);
4792
ctx->block->instructions.emplace_back(std::move(vec));
4793
emit_split_vector(ctx, dst, num_components);
4794
return;
4795
}
4796
4797
void
4798
emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4799
{
4800
Builder bld(ctx->program, ctx->block);
4801
Temp cond;
4802
4803
/* VRS Rate X = Ancillary[2:3]
4804
* VRS Rate Y = Ancillary[4:5]
4805
*/
4806
Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4807
Operand::c32(2u), Operand::c32(2u));
4808
Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4809
Operand::c32(4u), Operand::c32(2u));
4810
4811
/* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4812
cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4813
x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4814
bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4815
4816
/* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4817
cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4818
y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4819
bld.copy(bld.def(v1), Operand::c32(1u)), cond);
4820
4821
bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4822
}
4823
4824
void
4825
visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4826
{
4827
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4828
Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4829
unsigned idx = nir_intrinsic_base(instr);
4830
unsigned component = nir_intrinsic_component(instr);
4831
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4832
4833
assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4834
4835
if (instr->dest.ssa.num_components == 1) {
4836
emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4837
} else {
4838
aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4839
aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4840
for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4841
Temp tmp = ctx->program->allocateTmp(v1);
4842
emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4843
vec->operands[i] = Operand(tmp);
4844
}
4845
vec->definitions[0] = Definition(dst);
4846
ctx->block->instructions.emplace_back(std::move(vec));
4847
}
4848
}
4849
4850
bool
4851
check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4852
unsigned binding_align, unsigned channels)
4853
{
4854
unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4855
if (vtx_info->chan_byte_size != 4 && channels == 3)
4856
return false;
4857
4858
/* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4859
* alignment issues that trigger memory violations and eventually a GPU
4860
* hang. This can happen if the stride (static or dynamic) is unaligned and
4861
* also if the VBO offset is aligned to a scalar (e.g. stride is 8 and VBO
4862
* offset is 2 for R16G16B16A16_SNORM).
4863
*/
4864
return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4865
(offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4866
}
4867
4868
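/* Pick a data format for a vertex fetch of *channels channels. If a fetch of that
 * size can't be done safely, first try more channels (a single larger load), then
 * fewer channels (more loads), and update *channels accordingly.
 */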
uint8_t
4869
get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4870
unsigned* channels, unsigned max_channels, unsigned binding_align)
4871
{
4872
if (!vtx_info->chan_byte_size) {
4873
*channels = vtx_info->num_channels;
4874
return vtx_info->chan_format;
4875
}
4876
4877
unsigned num_channels = *channels;
4878
if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4879
unsigned new_channels = num_channels + 1;
4880
/* first, assume that more loads are worse and try using a larger data format */
4881
while (new_channels <= max_channels &&
4882
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4883
new_channels++;
4884
}
4885
4886
if (new_channels > max_channels) {
4887
/* then try decreasing load size (at the cost of more loads) */
4888
new_channels = *channels;
4889
while (new_channels > 1 &&
4890
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4891
new_channels--;
4892
}
4893
4894
if (new_channels < *channels)
4895
*channels = new_channels;
4896
num_channels = new_channels;
4897
}
4898
4899
switch (vtx_info->chan_format) {
4900
case V_008F0C_BUF_DATA_FORMAT_8:
4901
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4902
V_008F0C_BUF_DATA_FORMAT_INVALID,
4903
V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4904
case V_008F0C_BUF_DATA_FORMAT_16:
4905
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4906
V_008F0C_BUF_DATA_FORMAT_INVALID,
4907
V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4908
case V_008F0C_BUF_DATA_FORMAT_32:
4909
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4910
V_008F0C_BUF_DATA_FORMAT_32_32_32,
4911
V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4912
}
4913
unreachable("shouldn't reach here");
4914
return V_008F0C_BUF_DATA_FORMAT_INVALID;
4915
}
4916
4917
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
4918
* so we may need to fix it up. */
4919
Temp
4920
adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha)
4921
{
4922
Builder bld(ctx->program, ctx->block);
4923
4924
if (adjustment == AC_FETCH_FORMAT_SSCALED)
4925
alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4926
4927
/* For the integer-like cases, do a natural sign extension.
4928
*
4929
* For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4930
* and happen to contain 0, 1, 2, 3 as the two LSBs of the
4931
* exponent.
4932
*/
4933
unsigned offset = adjustment == AC_FETCH_FORMAT_SNORM ? 23u : 0u;
4934
alpha =
4935
bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
4936
4937
/* Convert back to the right type. */
4938
if (adjustment == AC_FETCH_FORMAT_SNORM) {
4939
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4940
alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4941
} else if (adjustment == AC_FETCH_FORMAT_SSCALED) {
4942
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4943
}
4944
4945
return alpha;
4946
}
4947
4948
void
4949
visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
4950
{
4951
Builder bld(ctx->program, ctx->block);
4952
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4953
nir_src offset = *nir_get_io_offset_src(instr);
4954
4955
if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4956
4957
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4958
isel_err(offset.ssa->parent_instr,
4959
"Unimplemented non-zero nir_intrinsic_load_input offset");
4960
4961
Temp vertex_buffers =
4962
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
4963
4964
unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
4965
unsigned component = nir_intrinsic_component(instr);
4966
unsigned bitsize = instr->dest.ssa.bit_size;
4967
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4968
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4969
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4970
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4971
unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
4972
enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location];
4973
4974
unsigned dfmt = attrib_format & 0xf;
4975
unsigned nfmt = (attrib_format >> 4) & 0x7;
4976
const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
4977
4978
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4979
unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4980
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4981
if (post_shuffle)
4982
num_channels = MAX2(num_channels, 3);
4983
4984
unsigned desc_index =
4985
ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
4986
desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
4987
u_bit_consecutive(0, desc_index));
4988
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
4989
Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4990
4991
Temp index;
4992
if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4993
uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4994
Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4995
if (divisor) {
4996
Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4997
if (divisor != 1) {
4998
Temp divided = bld.tmp(v1);
4999
emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5000
index = bld.vadd32(bld.def(v1), start_instance, divided);
5001
} else {
5002
index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5003
}
5004
} else {
5005
index = bld.copy(bld.def(v1), start_instance);
5006
}
5007
} else {
5008
index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5009
get_arg(ctx, ctx->args->ac.vertex_id));
5010
}
5011
5012
Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5013
unsigned channel_start = 0;
5014
bool direct_fetch = false;
5015
5016
/* skip unused channels at the start */
5017
if (vtx_info->chan_byte_size && !post_shuffle) {
5018
channel_start = ffs(mask) - 1;
5019
for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5020
channels[i] = Temp(0, s1);
5021
} else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5022
num_channels = 3 - (ffs(mask) - 1);
5023
}
5024
5025
/* load channels */
5026
while (channel_start < num_channels) {
5027
unsigned fetch_component = num_channels - channel_start;
5028
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5029
bool expanded = false;
5030
5031
/* use MUBUF when possible to avoid potential alignment issues */
5032
/* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5033
bool use_mubuf =
5034
(nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5035
nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5036
vtx_info->chan_byte_size == 4;
5037
unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5038
if (!use_mubuf) {
5039
fetch_dfmt =
5040
get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5041
vtx_info->num_channels - channel_start, binding_align);
5042
} else {
5043
if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5044
/* GFX6 can only load a vec3 with MTBUF (not MUBUF), so expand to vec4. */
5045
fetch_component = 4;
5046
expanded = true;
5047
}
5048
}
5049
5050
unsigned fetch_bytes = fetch_component * bitsize / 8;
5051
5052
Temp fetch_index = index;
5053
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5054
fetch_index =
5055
bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5056
fetch_offset = fetch_offset % attrib_stride;
5057
}
5058
5059
Operand soffset = Operand::zero();
5060
if (fetch_offset >= 4096) {
5061
soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5062
fetch_offset %= 4096;
5063
}
5064
5065
aco_opcode opcode;
5066
switch (fetch_bytes) {
5067
case 2:
5068
assert(!use_mubuf && bitsize == 16);
5069
opcode = aco_opcode::tbuffer_load_format_d16_x;
5070
break;
5071
case 4:
5072
if (bitsize == 16) {
5073
assert(!use_mubuf);
5074
opcode = aco_opcode::tbuffer_load_format_d16_xy;
5075
} else {
5076
opcode =
5077
use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5078
}
5079
break;
5080
case 6:
5081
assert(!use_mubuf && bitsize == 16);
5082
opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5083
break;
5084
case 8:
5085
if (bitsize == 16) {
5086
assert(!use_mubuf);
5087
opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5088
} else {
5089
opcode =
5090
use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5091
}
5092
break;
5093
case 12:
5094
assert(ctx->options->chip_class >= GFX7 ||
5095
(!use_mubuf && ctx->options->chip_class == GFX6));
5096
opcode =
5097
use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5098
break;
5099
case 16:
5100
opcode =
5101
use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5102
break;
5103
default: unreachable("Unimplemented load_input vector size");
5104
}
5105
5106
Temp fetch_dst;
5107
if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5108
(alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) {
5109
direct_fetch = true;
5110
fetch_dst = dst;
5111
} else {
5112
fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5113
}
5114
5115
if (use_mubuf) {
5116
Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5117
soffset, fetch_offset, false, false, true)
5118
.instr;
5119
mubuf->mubuf().vtx_binding = attrib_binding + 1;
5120
} else {
5121
Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5122
soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5123
.instr;
5124
mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5125
}
5126
5127
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5128
5129
if (fetch_component == 1) {
5130
channels[channel_start] = fetch_dst;
5131
} else {
5132
for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5133
channels[channel_start + i] =
5134
emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5135
}
5136
5137
channel_start += fetch_component;
5138
}
5139
5140
if (!direct_fetch) {
5141
bool is_float =
5142
nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5143
5144
static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5145
static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
5146
const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5147
unsigned num_components = instr->dest.ssa.num_components;
5148
5149
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5150
aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5151
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5152
unsigned num_temp = 0;
5153
for (unsigned i = 0; i < num_components; i++) {
5154
unsigned idx = i + component;
5155
if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5156
Temp channel = channels[swizzle[idx]];
5157
if (idx == 3 && alpha_adjust != AC_FETCH_FORMAT_NONE)
5158
channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5159
vec->operands[i] = Operand(channel);
5160
5161
num_temp++;
5162
elems[i] = channel;
5163
} else if (is_float && idx == 3) {
5164
vec->operands[i] = Operand::c32(0x3f800000u);
5165
} else if (!is_float && idx == 3) {
5166
vec->operands[i] = Operand::c32(1u);
5167
} else {
5168
vec->operands[i] = Operand::zero();
5169
}
5170
}
5171
vec->definitions[0] = Definition(dst);
5172
ctx->block->instructions.emplace_back(std::move(vec));
5173
emit_split_vector(ctx, dst, num_components);
5174
5175
if (num_temp == num_components)
5176
ctx->allocated_vec.emplace(dst.id(), elems);
5177
}
5178
} else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5179
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5180
isel_err(offset.ssa->parent_instr,
5181
"Unimplemented non-zero nir_intrinsic_load_input offset");
5182
5183
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5184
5185
unsigned idx = nir_intrinsic_base(instr);
5186
unsigned component = nir_intrinsic_component(instr);
5187
unsigned vertex_id = 2; /* P0 */
5188
5189
if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5190
nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5191
switch (src0->u32) {
5192
case 0:
5193
vertex_id = 2; /* P0 */
5194
break;
5195
case 1:
5196
vertex_id = 0; /* P10 */
5197
break;
5198
case 2:
5199
vertex_id = 1; /* P20 */
5200
break;
5201
default: unreachable("invalid vertex index");
5202
}
5203
}
5204
5205
if (dst.size() == 1) {
5206
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5207
bld.m0(prim_mask), idx, component);
5208
} else {
5209
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5210
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
5211
for (unsigned i = 0; i < dst.size(); i++)
5212
vec->operands[i] =
5213
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(vertex_id),
5214
bld.m0(prim_mask), idx, component + i);
5215
vec->definitions[0] = Definition(dst);
5216
bld.insert(std::move(vec));
5217
}
5218
} else {
5219
unreachable("Shader stage not implemented");
5220
}
5221
}
5222
5223
void
5224
visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5225
{
5226
assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5227
5228
Builder bld(ctx->program, ctx->block);
5229
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5230
5231
if (load_input_from_temps(ctx, instr, dst))
5232
return;
5233
5234
unreachable("LDS-based TCS input should have been lowered in NIR.");
5235
}
5236
5237
void
5238
visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5239
{
5240
switch (ctx->shader->info.stage) {
5241
case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5242
default: unreachable("Unimplemented shader stage");
5243
}
5244
}
5245
5246
void
5247
visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5248
{
5249
assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5250
5251
Builder bld(ctx->program, ctx->block);
5252
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5253
5254
Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5255
Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5256
Operand tes_w = Operand::zero();
5257
5258
if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5259
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5260
tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5261
tes_w = Operand(tmp);
5262
}
5263
5264
Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5265
emit_split_vector(ctx, tess_coord, 3);
5266
}
5267
5268
Temp
5269
load_desc_ptr(isel_context* ctx, unsigned desc_set)
5270
{
5271
if (ctx->program->info->need_indirect_descriptor_sets) {
5272
Builder bld(ctx->program, ctx->block);
5273
Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5274
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5275
return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5276
}
5277
5278
return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5279
}
5280
5281
void
5282
visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5283
{
5284
Builder bld(ctx->program, ctx->block);
5285
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5286
if (!nir_dest_is_divergent(instr->dest))
5287
index = bld.as_uniform(index);
5288
unsigned desc_set = nir_intrinsic_desc_set(instr);
5289
unsigned binding = nir_intrinsic_binding(instr);
5290
5291
Temp desc_ptr;
5292
radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5293
radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5294
unsigned offset = layout->binding[binding].offset;
5295
unsigned stride;
5296
if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5297
layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5298
unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5299
layout->binding[binding].dynamic_offset_offset;
5300
desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5301
offset = pipeline_layout->push_constant_size + 16 * idx;
5302
stride = 16;
5303
} else {
5304
desc_ptr = load_desc_ptr(ctx, desc_set);
5305
stride = layout->binding[binding].size;
5306
}
5307
5308
if (nir_src_is_const(instr->src[0])) {
5309
index =
5310
bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5311
} else if (index.type() == RegType::vgpr) {
5312
if (stride != 1) {
5313
bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5314
index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5315
}
5316
if (offset)
5317
index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5318
} else {
5319
if (stride != 1)
5320
index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5321
if (offset)
5322
index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5323
Operand::c32(offset), index);
5324
}
5325
5326
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5327
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5328
elems[0] = desc_ptr;
5329
elems[1] = index;
5330
ctx->allocated_vec.emplace(dst.id(), elems);
5331
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5332
}
5333
5334
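/* Load from a buffer resource, using SMEM when the result is uniform and the
 * access allows it (no GLC before GFX8), and MUBUF otherwise.
 */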
void
5335
load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5336
Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5337
bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5338
{
5339
Builder bld(ctx->program, ctx->block);
5340
5341
bool use_smem =
5342
dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5343
if (use_smem)
5344
offset = bld.as_uniform(offset);
5345
else {
5346
/* GFX6-7 are affected by a hw bug that prevents address clamping from
5347
* working correctly when the SGPR offset is used.
5348
*/
5349
if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5350
offset = as_vgpr(ctx, offset);
5351
}
5352
5353
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5354
info.glc = glc;
5355
info.sync = sync;
5356
info.align_mul = align_mul;
5357
info.align_offset = align_offset;
5358
if (use_smem)
5359
emit_load(ctx, bld, info, smem_load_params);
5360
else
5361
emit_load(ctx, bld, info, mubuf_load_params);
5362
}
5363
5364
Temp
5365
load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5366
{
5367
Builder bld(ctx->program, ctx->block);
5368
Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5369
Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5370
set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5371
return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5372
}
5373
5374
bool
5375
is_inline_ubo(isel_context* ctx, nir_src rsrc)
5376
{
5377
nir_binding binding = nir_chase_binding(rsrc);
5378
if (!binding.success)
5379
return false;
5380
5381
radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5382
return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5383
}
5384
5385
void
5386
visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5387
{
5388
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5389
Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5390
5391
Builder bld(ctx->program, ctx->block);
5392
5393
if (is_inline_ubo(ctx, instr->src[0])) {
5394
Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5395
Temp binding_off =
5396
bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5397
rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
5398
5399
uint32_t desc_type =
5400
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5401
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5402
if (ctx->options->chip_class >= GFX10) {
5403
desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5404
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5405
} else {
5406
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5407
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5408
}
5409
rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5410
Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5411
Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5412
} else {
5413
rsrc = load_buffer_rsrc(ctx, rsrc);
5414
}
5415
unsigned size = instr->dest.ssa.bit_size / 8;
5416
load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5417
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5418
}
5419
5420
void
5421
visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5422
{
5423
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5424
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5425
unsigned binding = nir_intrinsic_binding(instr);
5426
unsigned base = nir_intrinsic_base(instr);
5427
5428
index = as_vgpr(ctx, index);
5429
5430
Builder bld(ctx->program, ctx->block);
5431
Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5432
Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5433
Temp rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), desc_base, desc_off);
5434
5435
/* If we ever want more than a single 32-bit component, this needs to be implemented. */
5436
assert(instr->dest.ssa.bit_size == 32);
5437
assert(instr->num_components == 1);
5438
5439
bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst), rsrc, index, Operand::zero(), base,
5440
false, false, true);
5441
}
5442
5443
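/* Load push constants: values covered by the inline push constant arguments are
 * read directly from SGPRs, everything else is loaded from the push constant
 * buffer with SMEM (unaligned 8/16-bit results are fixed up afterwards).
 */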
void
5444
visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5445
{
5446
Builder bld(ctx->program, ctx->block);
5447
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5448
unsigned offset = nir_intrinsic_base(instr);
5449
unsigned count = instr->dest.ssa.num_components;
5450
nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5451
5452
if (index_cv && instr->dest.ssa.bit_size == 32) {
5453
unsigned start = (offset + index_cv->u32) / 4u;
5454
start -= ctx->args->ac.base_inline_push_consts;
5455
if (start + count <= ctx->args->ac.num_inline_push_consts) {
5456
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5457
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5458
aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5459
for (unsigned i = 0; i < count; ++i) {
5460
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5461
vec->operands[i] = Operand{elems[i]};
5462
}
5463
vec->definitions[0] = Definition(dst);
5464
ctx->block->instructions.emplace_back(std::move(vec));
5465
ctx->allocated_vec.emplace(dst.id(), elems);
5466
return;
5467
}
5468
}
5469
5470
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5471
if (offset != 0) // TODO check if index != 0 as well
5472
index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5473
Operand::c32(offset), index);
5474
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5475
Temp vec = dst;
5476
bool trim = false;
5477
bool aligned = true;
5478
5479
if (instr->dest.ssa.bit_size == 8) {
5480
aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5481
bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5482
if (!aligned)
5483
vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5484
} else if (instr->dest.ssa.bit_size == 16) {
5485
aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5486
if (!aligned)
5487
vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5488
}
5489
5490
aco_opcode op;
5491
5492
switch (vec.size()) {
5493
case 1: op = aco_opcode::s_load_dword; break;
5494
case 2: op = aco_opcode::s_load_dwordx2; break;
5495
case 3:
5496
vec = bld.tmp(s4);
5497
trim = true;
5498
FALLTHROUGH;
5499
case 4: op = aco_opcode::s_load_dwordx4; break;
5500
case 6:
5501
vec = bld.tmp(s8);
5502
trim = true;
5503
FALLTHROUGH;
5504
case 8: op = aco_opcode::s_load_dwordx8; break;
5505
default: unreachable("unimplemented or forbidden load_push_constant.");
5506
}
5507
5508
bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5509
5510
if (!aligned) {
5511
Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5512
byte_align_scalar(ctx, vec, byte_offset, dst);
5513
return;
5514
}
5515
5516
if (trim) {
5517
emit_split_vector(ctx, vec, 4);
5518
RegClass rc = dst.size() == 3 ? s1 : s2;
5519
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5520
emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5521
}
5522
emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5523
}
5524
5525
void
5526
visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5527
{
5528
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5529
5530
Builder bld(ctx->program, ctx->block);
5531
5532
uint32_t desc_type =
5533
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5534
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5535
if (ctx->options->chip_class >= GFX10) {
5536
desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5537
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5538
} else {
5539
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5540
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5541
}
5542
5543
unsigned base = nir_intrinsic_base(instr);
5544
unsigned range = nir_intrinsic_range(instr);
5545
5546
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5547
if (base && offset.type() == RegType::sgpr)
5548
offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5549
Operand::c32(base));
5550
else if (base && offset.type() == RegType::vgpr)
5551
offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5552
5553
Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5554
bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5555
Operand::c32(ctx->constant_data_offset)),
5556
Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5557
Operand::c32(desc_type));
5558
unsigned size = instr->dest.ssa.bit_size / 8;
5559
// TODO: get alignment information for subdword constants
5560
load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5561
}
5562
5563
void
visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
{
   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = true;

   ctx->program->needs_exact = true;

   // TODO: optimize uniform conditions
   Builder bld(ctx->program, ctx->block);
   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
   assert(src.regClass() == bld.lm);
   src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
   bld.pseudo(aco_opcode::p_discard_if, src);
   ctx->block->kind |= block_kind_uses_discard_if;
   return;
}

void
visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = true;

   bool divergent =
      ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;

   if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
      /* we handle discards the same way as jump instructions */
      append_logical_end(ctx->block);

      /* in loops, discard behaves like break */
      Block* linear_target = ctx->cf_info.parent_loop.exit;
      ctx->block->kind |= block_kind_discard;

      /* uniform discard - loop ends here */
      assert(nir_instr_is_last(&instr->instr));
      ctx->block->kind |= block_kind_uniform;
      ctx->cf_info.has_branch = true;
      bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
      add_linear_edge(ctx->block->index, linear_target);
      return;
   }

   /* it can currently happen that NIR doesn't remove the unreachable code */
   if (!nir_instr_is_last(&instr->instr)) {
      ctx->program->needs_exact = true;
      /* save exec somewhere temporarily so that it doesn't get
       * overwritten before the discard from outer exec masks */
      Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
      bld.pseudo(aco_opcode::p_discard_if, cond);
      ctx->block->kind |= block_kind_uses_discard_if;
      return;
   }

   /* This condition is incorrect for uniformly branched discards in a loop
    * predicated by a divergent condition, but the above code catches that case
    * and the discard would end up turning into a discard_if.
    * For example:
    * if (divergent) {
    *    while (...) {
    *       if (uniform) {
    *          discard;
    *       }
    *    }
    * }
    */
   if (!ctx->cf_info.parent_if.is_divergent) {
      /* program just ends here */
      ctx->block->kind |= block_kind_uses_discard_if;
      bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
      // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
   } else {
      ctx->block->kind |= block_kind_discard;
      /* branch and linear edge is added by visit_if() */
   }
}

enum aco_descriptor_type {
   ACO_DESC_IMAGE,
   ACO_DESC_FMASK,
   ACO_DESC_SAMPLER,
   ACO_DESC_BUFFER,
   ACO_DESC_PLANE_0,
   ACO_DESC_PLANE_1,
   ACO_DESC_PLANE_2,
};

static bool
should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
{
   if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
      return false;
   ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
   return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
          dim == ac_image_2darraymsaa;
}

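/* Builds the image/sampler/buffer descriptor for a texture or image access:
 * the deref chain is walked to accumulate a constant element index plus an
 * optional dynamic (uniformized) index, the matching binding of the
 * descriptor set layout supplies offset and stride, and the 4 or 8 dwords are
 * then loaded with SMEM. Immutable samplers that are known at compile time
 * are materialized directly as constants instead of being loaded. */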
Temp
5665
get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5666
enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5667
{
5668
   /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
      std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
      32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
   */
5672
Temp index = Temp();
5673
bool index_set = false;
5674
unsigned constant_index = 0;
5675
unsigned descriptor_set;
5676
unsigned base_index;
5677
Builder bld(ctx->program, ctx->block);
5678
5679
if (!deref_instr) {
5680
assert(tex_instr);
5681
descriptor_set = 0;
5682
base_index = tex_instr->sampler_index;
5683
} else {
5684
while (deref_instr->deref_type != nir_deref_type_var) {
5685
unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5686
if (!array_size)
5687
array_size = 1;
5688
5689
assert(deref_instr->deref_type == nir_deref_type_array);
5690
nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5691
if (const_value) {
5692
constant_index += array_size * const_value->u32;
5693
} else {
5694
Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5695
if (indirect.type() == RegType::vgpr)
5696
indirect = bld.as_uniform(indirect);
5697
5698
if (array_size != 1)
5699
indirect =
5700
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5701
5702
if (!index_set) {
5703
index = indirect;
5704
index_set = true;
5705
} else {
5706
index =
5707
bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5708
}
5709
}
5710
5711
deref_instr = nir_src_as_deref(deref_instr->parent);
5712
}
5713
descriptor_set = deref_instr->var->data.descriptor_set;
5714
base_index = deref_instr->var->data.binding;
5715
}
5716
5717
Temp list = load_desc_ptr(ctx, descriptor_set);
5718
list = convert_pointer_to_64_bit(ctx, list);
5719
5720
struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5721
struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5722
unsigned offset = binding->offset;
5723
unsigned stride = binding->size;
5724
aco_opcode opcode;
5725
RegClass type;
5726
5727
assert(base_index < layout->binding_count);
5728
5729
switch (desc_type) {
5730
case ACO_DESC_IMAGE:
5731
type = s8;
5732
opcode = aco_opcode::s_load_dwordx8;
5733
break;
5734
case ACO_DESC_FMASK:
5735
type = s8;
5736
opcode = aco_opcode::s_load_dwordx8;
5737
offset += 32;
5738
break;
5739
case ACO_DESC_SAMPLER:
5740
type = s4;
5741
opcode = aco_opcode::s_load_dwordx4;
5742
if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5743
offset += radv_combined_image_descriptor_sampler_offset(binding);
5744
break;
5745
case ACO_DESC_BUFFER:
5746
type = s4;
5747
opcode = aco_opcode::s_load_dwordx4;
5748
break;
5749
case ACO_DESC_PLANE_0:
5750
case ACO_DESC_PLANE_1:
5751
type = s8;
5752
opcode = aco_opcode::s_load_dwordx8;
5753
offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5754
break;
5755
case ACO_DESC_PLANE_2:
5756
type = s4;
5757
opcode = aco_opcode::s_load_dwordx4;
5758
offset += 64;
5759
break;
5760
default: unreachable("invalid desc_type\n");
5761
}
5762
5763
offset += constant_index * stride;
5764
5765
if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5766
(!index_set || binding->immutable_samplers_equal)) {
5767
if (binding->immutable_samplers_equal)
5768
constant_index = 0;
5769
5770
const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5771
uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5772
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5773
Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5774
Operand::c32(samplers[constant_index * 4 + 1]),
5775
Operand::c32(samplers[constant_index * 4 + 2]),
5776
Operand::c32(samplers[constant_index * 4 + 3]));
5777
}
5778
5779
Operand off;
5780
if (!index_set) {
5781
off = bld.copy(bld.def(s1), Operand::c32(offset));
5782
} else {
5783
off = Operand(
5784
(Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5785
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5786
}
5787
5788
Temp res = bld.smem(opcode, bld.def(type), list, off);
5789
5790
if (desc_type == ACO_DESC_PLANE_2) {
5791
Temp components[8];
5792
for (unsigned i = 0; i < 8; i++)
5793
components[i] = bld.tmp(s1);
5794
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5795
Definition(components[2]), Definition(components[3]), res);
5796
5797
Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5798
bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5799
Definition(components[4]), Definition(components[5]), Definition(components[6]),
5800
Definition(components[7]), desc2);
5801
5802
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5803
components[2], components[3], components[4], components[5], components[6],
5804
components[7]);
5805
} else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5806
!write) {
5807
Temp components[8];
5808
for (unsigned i = 0; i < 8; i++)
5809
components[i] = bld.tmp(s1);
5810
5811
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5812
Definition(components[2]), Definition(components[3]), Definition(components[4]),
5813
Definition(components[5]), Definition(components[6]), Definition(components[7]),
5814
res);
5815
5816
      /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a
       * hardware bug.
       */
5819
components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5820
bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5821
5822
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5823
components[2], components[3], components[4], components[5], components[6],
5824
components[7]);
5825
} else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5826
Temp components[4];
5827
for (unsigned i = 0; i < 4; i++)
5828
components[i] = bld.tmp(s1);
5829
5830
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5831
Definition(components[2]), Definition(components[3]), res);
5832
5833
/* We want to always use the linear filtering truncation behaviour for
5834
* nir_texop_tg4, even if the sampler uses nearest/point filtering.
5835
*/
5836
components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5837
Operand::c32(C_008F30_TRUNC_COORD));
5838
5839
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5840
components[2], components[3]);
5841
}
5842
5843
return res;
5844
}
5845
5846
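/* Number of coordinate components NIR supplies for each image dimensionality;
 * arrayed and multisampled images carry an extra layer/sample component
 * (the sample component for MS images is filled in by get_image_coords). */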
static int
image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_BUF: return 1;
   case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
   case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
   case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
   case GLSL_SAMPLER_DIM_3D:
   case GLSL_SAMPLER_DIM_CUBE: return 3;
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_SUBPASS: return 2;
   case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
   default: break;
   }
   return 0;
}

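/* Common helper for emitting MIMG instructions. On GFX10+ the non-sequential
 * address (NSA) encoding lets each coordinate live in its own VGPR (up to a
 * chip-dependent limit); otherwise the coordinates are first packed into a
 * single vector with p_create_vector. wqm_mask marks which coordinate
 * operands must be computed in whole-quad mode. */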
static MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
          std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
{
   /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
   unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
   bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;

   if (!use_nsa) {
      Temp coord = coords[0];
      if (coords.size() > 1) {
         coord = bld.tmp(RegType::vgpr, coords.size());

         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
         for (unsigned i = 0; i < coords.size(); i++)
            vec->operands[i] = Operand(coords[i]);
         vec->definitions[0] = Definition(coord);
         bld.insert(std::move(vec));
      } else if (coord.type() == RegType::sgpr) {
         coord = bld.copy(bld.def(v1), coord);
      }

      if (wqm_mask) {
         /* We don't need the bias, sample index, compare value or offset to be
          * computed in WQM but if the p_create_vector copies the coordinates, then it
          * needs to be in WQM. */
         coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
      }

      coords[0] = coord;
      coords.resize(1);
   } else {
      for (unsigned i = 0; i < coords.size(); i++) {
         if (wqm_mask & (1u << i))
            coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
      }

      for (Temp& coord : coords) {
         if (coord.type() == RegType::sgpr)
            coord = bld.copy(bld.def(v1), coord);
      }
   }

   aco_ptr<MIMG_instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
   if (dst.isTemp())
      mimg->definitions[0] = dst;
   mimg->operands[0] = Operand(rsrc);
   mimg->operands[1] = samp;
   mimg->operands[2] = vdata;
   for (unsigned i = 0; i < coords.size(); i++)
      mimg->operands[3 + i] = Operand(coords[i]);

   MIMG_instruction* res = mimg.get();
   bld.insert(std::move(mimg));
   return res;
}
5922
5923
void
5924
visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5925
{
5926
Builder bld(ctx->program, ctx->block);
5927
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5928
Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5929
Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5930
Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5931
Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5932
Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5933
Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5934
5935
std::vector<Temp> args;
5936
args.push_back(emit_extract_vector(ctx, node, 0, v1));
5937
args.push_back(emit_extract_vector(ctx, node, 1, v1));
5938
args.push_back(as_vgpr(ctx, tmax));
5939
args.push_back(emit_extract_vector(ctx, origin, 0, v1));
5940
args.push_back(emit_extract_vector(ctx, origin, 1, v1));
5941
args.push_back(emit_extract_vector(ctx, origin, 2, v1));
5942
args.push_back(emit_extract_vector(ctx, dir, 0, v1));
5943
args.push_back(emit_extract_vector(ctx, dir, 1, v1));
5944
args.push_back(emit_extract_vector(ctx, dir, 2, v1));
5945
args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
5946
args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
5947
args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
5948
5949
MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
5950
resource, Operand(s4), args);
5951
mimg->dim = ac_image_1d;
5952
mimg->dmask = 0xf;
5953
mimg->unrm = true;
5954
mimg->r128 = true;
5955
}
5956
5957
/* Adjust the sample index according to FMASK.
 *
 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
 * which is the identity mapping. Each nibble says which physical sample
 * should be fetched to get that sample.
 *
 * For example, 0x11111100 means there are only 2 samples stored and
 * the second sample covers 3/4 of the pixel. When reading samples 0
 * and 1, return physical sample 0 (determined by the first two 0s
 * in FMASK), otherwise return physical sample 1.
 *
 * The sample index should be adjusted as follows:
 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
 */
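/* For example, applying that formula to the FMASK value 0x11111100 above:
 * requesting sample 3 reads nibble 3, (0x11111100 >> 12) & 0xF = 1, so
 * physical sample 1 is fetched, while samples 0 and 1 both map to physical
 * sample 0. */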
static Temp
adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector<Temp>& coords,
                                Operand sample_index, Temp fmask_desc_ptr)
{
   Builder bld(ctx->program, ctx->block);
   Temp fmask = bld.tmp(v1);
   unsigned dim = ctx->options->chip_class >= GFX10
                     ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
                     : 0;

   MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask),
                                      fmask_desc_ptr, Operand(s4), coords);
   load->glc = false;
   load->dlc = false;
   load->dmask = 0x1;
   load->unrm = true;
   load->da = da;
   load->dim = dim;

   Operand sample_index4;
   if (sample_index.isConstant()) {
      if (sample_index.constantValue() < 16) {
         sample_index4 = Operand::c32(sample_index.constantValue() << 2);
      } else {
         sample_index4 = Operand::zero();
      }
   } else if (sample_index.regClass() == s1) {
      sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index,
                               Operand::c32(2u));
   } else {
      assert(sample_index.regClass() == v1);
      sample_index4 =
         bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), sample_index);
   }

   Temp final_sample;
   if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
      final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(15u), fmask);
   else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
      final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(28u), fmask);
   else
      final_sample =
         bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand::c32(4u));

   /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
    * resource descriptor is 0 (invalid).
    */
   Temp compare = bld.tmp(bld.lm);
   bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand::zero(),
                emit_extract_vector(ctx, fmask_desc_ptr, 1, s1))
      .def(0)
      .setHint(vcc);

   Temp sample_index_v = bld.copy(bld.def(v1), sample_index);

   /* Replace the MSAA sample index. */
   return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
}
6029
6030
static std::vector<Temp>
6031
get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr, const struct glsl_type* type)
6032
{
6033
6034
Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6035
enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6036
bool is_array = glsl_sampler_type_is_array(type);
6037
ASSERTED bool add_frag_pos =
6038
(dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6039
assert(!add_frag_pos && "Input attachments should be lowered.");
6040
bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6041
bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6042
int count = image_type_to_components_count(dim, is_array);
6043
std::vector<Temp> coords(count);
6044
Builder bld(ctx->program, ctx->block);
6045
6046
if (is_ms) {
6047
count--;
6048
Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
6049
/* get sample index */
6050
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6051
instr->intrinsic == nir_intrinsic_image_deref_sparse_load) {
6052
nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]);
6053
Operand sample_index = sample_cv ? Operand::c32(sample_cv->u32)
6054
: Operand(emit_extract_vector(ctx, src2, 0, v1));
6055
std::vector<Temp> fmask_load_address;
6056
for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
6057
fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
6058
6059
Temp fmask_desc_ptr =
6060
get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6061
ACO_DESC_FMASK, nullptr, false);
6062
coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address,
6063
sample_index, fmask_desc_ptr);
6064
} else {
6065
coords[count] = emit_extract_vector(ctx, src2, 0, v1);
6066
}
6067
}
6068
6069
if (gfx9_1d) {
6070
coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6071
coords.resize(coords.size() + 1);
6072
coords[1] = bld.copy(bld.def(v1), Operand::zero());
6073
if (is_array)
6074
coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6075
} else {
6076
for (int i = 0; i < count; i++)
6077
coords[i] = emit_extract_vector(ctx, src0, i, v1);
6078
}
6079
6080
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6081
instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6082
instr->intrinsic == nir_intrinsic_image_deref_store) {
6083
int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6084
bool level_zero =
6085
nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6086
6087
if (!level_zero)
6088
coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6089
}
6090
6091
return coords;
6092
}
6093
6094
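/* Derives the memory_sync_info for a memory intrinsic from its NIR access
 * flags: volatile accesses gain semantic_volatile and reorderable accesses
 * gain semantic_can_reorder | semantic_private, while atomics keep the
 * semantics passed in unchanged. */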
memory_sync_info
get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
{
   /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
   if (semantics & semantic_atomicrmw)
      return memory_sync_info(storage, semantics);

   unsigned access = nir_intrinsic_access(instr);

   if (access & ACCESS_VOLATILE)
      semantics |= semantic_volatile;
   if (access & ACCESS_CAN_REORDER)
      semantics |= semantic_can_reorder | semantic_private;

   return memory_sync_info(storage, semantics);
}

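/* Builds the zero-initialized data operand used when TFE (texel fail enable)
 * is set: a sparse load only writes the fetched channels plus the residency
 * code, so the destination has to start out zeroed to keep the unwritten
 * channels well defined. */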
Operand
emit_tfe_init(Builder& bld, Temp dst)
{
   Temp tmp = bld.tmp(dst.regClass());

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
   for (unsigned i = 0; i < dst.size(); i++)
      vec->operands[i] = Operand::zero();
   vec->definitions[0] = Definition(tmp);
   /* Since this is fixed to an instruction's definition register, any CSE will
    * just create copies. Copying costs about the same as zero-initialization,
    * but these copies can break up clauses.
    */
   vec->definitions[0].setNoCSE(true);
   bld.insert(std::move(vec));

   return Operand(tmp);
}
6130
6131
void
6132
visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6133
{
6134
Builder bld(ctx->program, ctx->block);
6135
const nir_variable* var =
6136
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6137
const struct glsl_type* type = glsl_without_array(var->type);
6138
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6139
bool is_array = glsl_sampler_type_is_array(type);
6140
bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6141
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6142
6143
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6144
unsigned access = var->data.access | nir_intrinsic_access(instr);
6145
6146
unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6147
unsigned expand_mask =
6148
nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6149
expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6150
if (dim == GLSL_SAMPLER_DIM_BUF)
6151
expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6152
unsigned dmask = expand_mask;
6153
if (instr->dest.ssa.bit_size == 64) {
6154
expand_mask &= 0x9;
6155
/* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6156
dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6157
}
6158
if (is_sparse)
6159
expand_mask |= 1 << result_size;
6160
unsigned num_components = util_bitcount(dmask) + is_sparse;
6161
6162
Temp tmp;
6163
if (num_components == dst.size() && dst.type() == RegType::vgpr)
6164
tmp = dst;
6165
else
6166
tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6167
6168
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6169
dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6170
nullptr, false);
6171
6172
if (dim == GLSL_SAMPLER_DIM_BUF) {
6173
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6174
6175
aco_opcode opcode;
6176
switch (util_bitcount(dmask)) {
6177
case 1: opcode = aco_opcode::buffer_load_format_x; break;
6178
case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6179
case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6180
case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6181
default: unreachable(">4 channel buffer image load");
6182
}
6183
aco_ptr<MUBUF_instruction> load{
6184
create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6185
load->operands[0] = Operand(resource);
6186
load->operands[1] = Operand(vindex);
6187
load->operands[2] = Operand::c32(0);
6188
load->definitions[0] = Definition(tmp);
6189
load->idxen = true;
6190
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6191
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6192
load->sync = sync;
6193
load->tfe = is_sparse;
6194
if (load->tfe)
6195
load->operands[3] = emit_tfe_init(bld, tmp);
6196
ctx->block->instructions.emplace_back(std::move(load));
6197
} else {
6198
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6199
6200
bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6201
aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6202
6203
Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6204
MIMG_instruction* load =
6205
emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6206
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6207
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6208
load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6209
load->dmask = dmask;
6210
load->unrm = true;
6211
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6212
load->sync = sync;
6213
load->tfe = is_sparse;
6214
}
6215
6216
if (is_sparse && instr->dest.ssa.bit_size == 64) {
6217
/* The result components are 64-bit but the sparse residency code is
6218
* 32-bit. So add a zero to the end so expand_vector() works correctly.
6219
*/
6220
tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6221
Operand::zero());
6222
}
6223
6224
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6225
}
6226
6227
void
6228
visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6229
{
6230
const nir_variable* var =
6231
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6232
const struct glsl_type* type = glsl_without_array(var->type);
6233
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6234
bool is_array = glsl_sampler_type_is_array(type);
6235
Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6236
6237
/* only R64_UINT and R64_SINT supported */
6238
if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6239
data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6240
data = as_vgpr(ctx, data);
6241
6242
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6243
unsigned access = var->data.access | nir_intrinsic_access(instr);
6244
bool glc = ctx->options->chip_class == GFX6 ||
6245
access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
6246
? 1
6247
: 0;
6248
6249
if (dim == GLSL_SAMPLER_DIM_BUF) {
6250
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6251
ACO_DESC_BUFFER, nullptr, true);
6252
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6253
aco_opcode opcode;
6254
switch (data.size()) {
6255
case 1: opcode = aco_opcode::buffer_store_format_x; break;
6256
case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6257
case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6258
case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6259
default: unreachable(">4 channel buffer image store");
6260
}
6261
aco_ptr<MUBUF_instruction> store{
6262
create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6263
store->operands[0] = Operand(rsrc);
6264
store->operands[1] = Operand(vindex);
6265
store->operands[2] = Operand::c32(0);
6266
store->operands[3] = Operand(data);
6267
store->idxen = true;
6268
store->glc = glc;
6269
store->dlc = false;
6270
store->disable_wqm = true;
6271
store->sync = sync;
6272
ctx->program->needs_exact = true;
6273
ctx->block->instructions.emplace_back(std::move(store));
6274
return;
6275
}
6276
6277
assert(data.type() == RegType::vgpr);
6278
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6279
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6280
ACO_DESC_IMAGE, nullptr, true);
6281
6282
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6283
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6284
6285
Builder bld(ctx->program, ctx->block);
6286
MIMG_instruction* store =
6287
emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6288
store->glc = glc;
6289
store->dlc = false;
6290
store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6291
store->dmask = (1 << data.size()) - 1;
6292
store->unrm = true;
6293
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6294
store->disable_wqm = true;
6295
store->sync = sync;
6296
ctx->program->needs_exact = true;
6297
return;
6298
}
6299
6300
void
6301
visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6302
{
6303
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6304
const nir_variable* var =
6305
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6306
const struct glsl_type* type = glsl_without_array(var->type);
6307
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6308
bool is_array = glsl_sampler_type_is_array(type);
6309
Builder bld(ctx->program, ctx->block);
6310
6311
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6312
bool is_64bit = data.bytes() == 8;
6313
assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6314
6315
if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6316
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6317
get_ssa_temp(ctx, instr->src[4].ssa), data);
6318
6319
aco_opcode buf_op, buf_op64, image_op;
6320
switch (instr->intrinsic) {
6321
case nir_intrinsic_image_deref_atomic_add:
6322
buf_op = aco_opcode::buffer_atomic_add;
6323
buf_op64 = aco_opcode::buffer_atomic_add_x2;
6324
image_op = aco_opcode::image_atomic_add;
6325
break;
6326
case nir_intrinsic_image_deref_atomic_umin:
6327
buf_op = aco_opcode::buffer_atomic_umin;
6328
buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6329
image_op = aco_opcode::image_atomic_umin;
6330
break;
6331
case nir_intrinsic_image_deref_atomic_imin:
6332
buf_op = aco_opcode::buffer_atomic_smin;
6333
buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6334
image_op = aco_opcode::image_atomic_smin;
6335
break;
6336
case nir_intrinsic_image_deref_atomic_umax:
6337
buf_op = aco_opcode::buffer_atomic_umax;
6338
buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6339
image_op = aco_opcode::image_atomic_umax;
6340
break;
6341
case nir_intrinsic_image_deref_atomic_imax:
6342
buf_op = aco_opcode::buffer_atomic_smax;
6343
buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6344
image_op = aco_opcode::image_atomic_smax;
6345
break;
6346
case nir_intrinsic_image_deref_atomic_and:
6347
buf_op = aco_opcode::buffer_atomic_and;
6348
buf_op64 = aco_opcode::buffer_atomic_and_x2;
6349
image_op = aco_opcode::image_atomic_and;
6350
break;
6351
case nir_intrinsic_image_deref_atomic_or:
6352
buf_op = aco_opcode::buffer_atomic_or;
6353
buf_op64 = aco_opcode::buffer_atomic_or_x2;
6354
image_op = aco_opcode::image_atomic_or;
6355
break;
6356
case nir_intrinsic_image_deref_atomic_xor:
6357
buf_op = aco_opcode::buffer_atomic_xor;
6358
buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6359
image_op = aco_opcode::image_atomic_xor;
6360
break;
6361
case nir_intrinsic_image_deref_atomic_exchange:
6362
buf_op = aco_opcode::buffer_atomic_swap;
6363
buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6364
image_op = aco_opcode::image_atomic_swap;
6365
break;
6366
case nir_intrinsic_image_deref_atomic_comp_swap:
6367
buf_op = aco_opcode::buffer_atomic_cmpswap;
6368
buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6369
image_op = aco_opcode::image_atomic_cmpswap;
6370
break;
6371
default:
6372
unreachable("visit_image_atomic should only be called with "
6373
"nir_intrinsic_image_deref_atomic_* instructions.");
6374
}
6375
6376
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6377
memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6378
6379
if (dim == GLSL_SAMPLER_DIM_BUF) {
6380
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6381
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6382
ACO_DESC_BUFFER, nullptr, true);
6383
// assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
6384
// implemented.");
6385
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6386
is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6387
mubuf->operands[0] = Operand(resource);
6388
mubuf->operands[1] = Operand(vindex);
6389
mubuf->operands[2] = Operand::c32(0);
6390
mubuf->operands[3] = Operand(data);
6391
if (return_previous)
6392
mubuf->definitions[0] = Definition(dst);
6393
mubuf->offset = 0;
6394
mubuf->idxen = true;
6395
mubuf->glc = return_previous;
6396
mubuf->dlc = false; /* Not needed for atomics */
6397
mubuf->disable_wqm = true;
6398
mubuf->sync = sync;
6399
ctx->program->needs_exact = true;
6400
ctx->block->instructions.emplace_back(std::move(mubuf));
6401
return;
6402
}
6403
6404
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6405
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6406
ACO_DESC_IMAGE, nullptr, true);
6407
Definition def = return_previous ? Definition(dst) : Definition();
6408
MIMG_instruction* mimg =
6409
emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6410
mimg->glc = return_previous;
6411
mimg->dlc = false; /* Not needed for atomics */
6412
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6413
mimg->dmask = (1 << data.size()) - 1;
6414
mimg->unrm = true;
6415
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6416
mimg->disable_wqm = true;
6417
mimg->sync = sync;
6418
ctx->program->needs_exact = true;
6419
return;
6420
}
6421
6422
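/* Roughly speaking: returns the buffer size in elements. On GFX8 the size
 * dword (element 2 of the descriptor) has to be divided by the stride, which
 * is one of 1, 2, 4, 8, 12 or 16. Power-of-two strides become a shift by the
 * stride's lowest set bit; the stride-12 case is first multiplied by the 1/3
 * reciprocal 0xaaaaaaab (v_mul_hi_u32 plus a shift by one) and then the same
 * shift by two divides out the remaining factor of four. On other chips the
 * size dword can be returned directly. */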
void
get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
{
   if (ctx->options->chip_class == GFX8) {
      /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
      Builder bld(ctx->program, ctx->block);

      Temp size = emit_extract_vector(ctx, desc, 2, s1);

      Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
                                bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
      size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
                           bld.as_uniform(size_div3), Operand::c32(1u));

      Temp stride = emit_extract_vector(ctx, desc, 1, s1);
      stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
                        Operand::c32((5u << 16) | 16u));

      Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
      size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));

      Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
      bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
               bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
      if (dst.type() == RegType::vgpr)
         bld.copy(Definition(dst), shr_dst);

      /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
   } else {
      emit_extract_vector(ctx, desc, 2, dst);
   }
}
6454
6455
void
6456
visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6457
{
6458
const nir_variable* var =
6459
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6460
const struct glsl_type* type = glsl_without_array(var->type);
6461
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6462
bool is_array = glsl_sampler_type_is_array(type);
6463
Builder bld(ctx->program, ctx->block);
6464
6465
if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6466
Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6467
ACO_DESC_BUFFER, NULL, false);
6468
return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6469
}
6470
6471
/* LOD */
6472
assert(nir_src_as_uint(instr->src[1]) == 0);
6473
std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6474
6475
/* Resource */
6476
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6477
ACO_DESC_IMAGE, NULL, false);
6478
6479
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6480
6481
MIMG_instruction* mimg =
6482
emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6483
uint8_t& dmask = mimg->dmask;
6484
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6485
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6486
mimg->da = glsl_sampler_type_is_array(type);
6487
6488
if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && glsl_sampler_type_is_array(type)) {
6489
6490
assert(instr->dest.ssa.num_components == 3);
6491
Temp tmp = ctx->program->allocateTmp(v3);
6492
mimg->definitions[0] = Definition(tmp);
6493
emit_split_vector(ctx, tmp, 3);
6494
6495
/* divide 3rd value by 6 by multiplying with magic number */
6496
Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));
6497
Temp by_6 =
6498
bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6499
6500
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, tmp, 0, v1),
6501
emit_extract_vector(ctx, tmp, 1, v1), by_6);
6502
6503
} else if (ctx->options->chip_class == GFX9 &&
6504
glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6505
glsl_sampler_type_is_array(type)) {
6506
assert(instr->dest.ssa.num_components == 2);
6507
dmask = 0x5;
6508
}
6509
6510
emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6511
}
6512
6513
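/* Reads the sample count out of an image descriptor: bits [19:16] of dword 3
 * hold log2(samples) for MSAA resources and bits [31:28] hold the resource
 * type, where (as assumed here) values of 14 and up are the 2D MSAA types.
 * Non-MSAA images report one sample, or the null-descriptor check result when
 * robust buffer access is enabled. */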
void
get_image_samples(isel_context* ctx, Definition dst, Temp resource)
{
   Builder bld(ctx->program, ctx->block);

   Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
   Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
                                Operand::c32(16u | 4u << 16));
   Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
                           samples_log2);
   Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
                        Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));

   Operand default_sample = Operand::c32(1u);
   if (ctx->options->robust_buffer_access) {
      /* Extract the second dword of the descriptor; if it's
       * all zero, then it's a null descriptor.
       */
      Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
      Temp is_non_null_descriptor =
         bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
      default_sample = Operand(is_non_null_descriptor);
   }

   Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
   bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
}

void
visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
                                    ACO_DESC_IMAGE, NULL, false);
   get_image_samples(ctx, Definition(dst), resource);
}

void
visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   unsigned num_components = instr->num_components;

   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));

   unsigned access = nir_intrinsic_access(instr);
   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
   unsigned size = instr->dest.ssa.bit_size / 8;

   bool allow_smem = access & ACCESS_CAN_REORDER;

   load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
               get_memory_sync_info(instr, storage_buffer, 0));
}
6570
6571
void
6572
visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6573
{
6574
Builder bld(ctx->program, ctx->block);
6575
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6576
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6577
unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6578
Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6579
6580
Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6581
6582
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6583
bool glc =
6584
nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6585
6586
unsigned write_count = 0;
6587
Temp write_datas[32];
6588
unsigned offsets[32];
6589
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6590
write_datas, offsets);
6591
6592
/* GFX6-7 are affected by a hw bug that prevents address clamping to work
6593
* correctly when the SGPR offset is used.
6594
*/
6595
if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6596
offset = as_vgpr(ctx, offset);
6597
6598
for (unsigned i = 0; i < write_count; i++) {
6599
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6600
6601
aco_ptr<MUBUF_instruction> store{
6602
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6603
store->operands[0] = Operand(rsrc);
6604
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6605
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6606
store->operands[3] = Operand(write_datas[i]);
6607
store->offset = offsets[i];
6608
store->offen = (offset.type() == RegType::vgpr);
6609
store->glc = glc;
6610
store->dlc = false;
6611
store->disable_wqm = true;
6612
store->sync = sync;
6613
ctx->program->needs_exact = true;
6614
ctx->block->instructions.emplace_back(std::move(store));
6615
}
6616
}
6617
6618
void
6619
visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6620
{
6621
Builder bld(ctx->program, ctx->block);
6622
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6623
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6624
6625
if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6626
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6627
get_ssa_temp(ctx, instr->src[3].ssa), data);
6628
6629
Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6630
Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6631
6632
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6633
6634
aco_opcode op32, op64;
6635
switch (instr->intrinsic) {
6636
case nir_intrinsic_ssbo_atomic_add:
6637
op32 = aco_opcode::buffer_atomic_add;
6638
op64 = aco_opcode::buffer_atomic_add_x2;
6639
break;
6640
case nir_intrinsic_ssbo_atomic_imin:
6641
op32 = aco_opcode::buffer_atomic_smin;
6642
op64 = aco_opcode::buffer_atomic_smin_x2;
6643
break;
6644
case nir_intrinsic_ssbo_atomic_umin:
6645
op32 = aco_opcode::buffer_atomic_umin;
6646
op64 = aco_opcode::buffer_atomic_umin_x2;
6647
break;
6648
case nir_intrinsic_ssbo_atomic_imax:
6649
op32 = aco_opcode::buffer_atomic_smax;
6650
op64 = aco_opcode::buffer_atomic_smax_x2;
6651
break;
6652
case nir_intrinsic_ssbo_atomic_umax:
6653
op32 = aco_opcode::buffer_atomic_umax;
6654
op64 = aco_opcode::buffer_atomic_umax_x2;
6655
break;
6656
case nir_intrinsic_ssbo_atomic_and:
6657
op32 = aco_opcode::buffer_atomic_and;
6658
op64 = aco_opcode::buffer_atomic_and_x2;
6659
break;
6660
case nir_intrinsic_ssbo_atomic_or:
6661
op32 = aco_opcode::buffer_atomic_or;
6662
op64 = aco_opcode::buffer_atomic_or_x2;
6663
break;
6664
case nir_intrinsic_ssbo_atomic_xor:
6665
op32 = aco_opcode::buffer_atomic_xor;
6666
op64 = aco_opcode::buffer_atomic_xor_x2;
6667
break;
6668
case nir_intrinsic_ssbo_atomic_exchange:
6669
op32 = aco_opcode::buffer_atomic_swap;
6670
op64 = aco_opcode::buffer_atomic_swap_x2;
6671
break;
6672
case nir_intrinsic_ssbo_atomic_comp_swap:
6673
op32 = aco_opcode::buffer_atomic_cmpswap;
6674
op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6675
break;
6676
default:
6677
unreachable(
6678
"visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6679
}
6680
aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6681
aco_ptr<MUBUF_instruction> mubuf{
6682
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6683
mubuf->operands[0] = Operand(rsrc);
6684
mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6685
mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6686
mubuf->operands[3] = Operand(data);
6687
if (return_previous)
6688
mubuf->definitions[0] = Definition(dst);
6689
mubuf->offset = 0;
6690
mubuf->offen = (offset.type() == RegType::vgpr);
6691
mubuf->glc = return_previous;
6692
mubuf->dlc = false; /* Not needed for atomics */
6693
mubuf->disable_wqm = true;
6694
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6695
ctx->program->needs_exact = true;
6696
ctx->block->instructions.emplace_back(std::move(mubuf));
6697
}
6698
6699
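/* The SSBO size lives in the third dword (num_records) of the buffer
 * descriptor. For a non-uniform (VGPR) result that dword is loaded straight
 * from descriptor memory at byte offset 8, while the uniform path builds the
 * resource with load_buffer_rsrc and extracts element 2. */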
void
visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
{

   Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   bool non_uniform = dst.type() == RegType::vgpr;

   Builder bld(ctx->program, ctx->block);
   if (non_uniform) {
      Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
      Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
      Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
      index = convert_pointer_to_64_bit(ctx, index, non_uniform);

      LoadEmitInfo info = {Operand(index), dst, 1, 4};
      info.align_mul = 4;
      info.const_offset = 8;
      emit_load(ctx, bld, info, global_load_params);
   } else {
      emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
   }
}

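/* Global loads can go through either SMEM or VMEM. SMEM is only usable when
 * the result is uniform (SGPR destination), the memory is not writeable by
 * the shader (VMEM stores bypass the scalar cache) and, before GFX8, when the
 * load does not need the GLC bit. */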
void
visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   unsigned num_components = instr->num_components;
   unsigned component_size = instr->dest.ssa.bit_size / 8;

   LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
                        get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
   info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.sync = get_memory_sync_info(instr, storage_buffer, 0);
   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
    * it's safe to use SMEM */
   bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
   if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
       !can_use_smem) {
      emit_load(ctx, bld, info, global_load_params);
   } else {
      info.offset = Operand(bld.as_uniform(info.offset));
      emit_load(ctx, bld, info, smem_load_params);
   }
}
6747
6748
void
6749
visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6750
{
6751
Builder bld(ctx->program, ctx->block);
6752
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6753
unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6754
6755
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6756
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6757
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6758
bool glc =
6759
nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6760
6761
if (ctx->options->chip_class >= GFX7)
6762
addr = as_vgpr(ctx, addr);
6763
6764
unsigned write_count = 0;
6765
Temp write_datas[32];
6766
unsigned offsets[32];
6767
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6768
write_datas, offsets);
6769
6770
for (unsigned i = 0; i < write_count; i++) {
6771
if (ctx->options->chip_class >= GFX7) {
6772
unsigned offset = offsets[i];
6773
Temp store_addr = addr;
6774
if (offset > 0 && ctx->options->chip_class < GFX9) {
6775
Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6776
Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6777
Temp carry = bld.tmp(bld.lm);
6778
bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6779
6780
bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6781
bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6782
bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6783
Operand::zero(), addr1, carry)
6784
.def(1)
6785
.setHint(vcc);
6786
6787
store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6788
6789
offset = 0;
6790
}
6791
6792
bool global = ctx->options->chip_class >= GFX9;
6793
aco_opcode op;
6794
switch (write_datas[i].bytes()) {
6795
case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6796
case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6797
case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6798
case 8:
6799
op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6800
break;
6801
case 12:
6802
op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6803
break;
6804
case 16:
6805
op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6806
break;
6807
default: unreachable("store_global not implemented for this size.");
6808
}
6809
6810
aco_ptr<FLAT_instruction> flat{
6811
create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6812
flat->operands[0] = Operand(store_addr);
6813
flat->operands[1] = Operand(s1);
6814
flat->operands[2] = Operand(write_datas[i]);
6815
flat->glc = glc;
6816
flat->dlc = false;
6817
flat->offset = offset;
6818
flat->disable_wqm = true;
6819
flat->sync = sync;
6820
ctx->program->needs_exact = true;
6821
ctx->block->instructions.emplace_back(std::move(flat));
6822
} else {
6823
assert(ctx->options->chip_class == GFX6);
6824
6825
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6826
6827
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6828
6829
aco_ptr<MUBUF_instruction> mubuf{
6830
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6831
mubuf->operands[0] = Operand(rsrc);
6832
mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6833
mubuf->operands[2] = Operand::zero();
6834
mubuf->operands[3] = Operand(write_datas[i]);
6835
mubuf->glc = glc;
6836
mubuf->dlc = false;
6837
mubuf->offset = offsets[i];
6838
mubuf->addr64 = addr.type() == RegType::vgpr;
6839
mubuf->disable_wqm = true;
6840
mubuf->sync = sync;
6841
ctx->program->needs_exact = true;
6842
ctx->block->instructions.emplace_back(std::move(mubuf));
6843
}
6844
}
6845
}
6846
6847
void
6848
visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6849
{
6850
Builder bld(ctx->program, ctx->block);
6851
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6852
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6853
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6854
6855
if (ctx->options->chip_class >= GFX7)
6856
addr = as_vgpr(ctx, addr);
6857
6858
if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6859
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6860
get_ssa_temp(ctx, instr->src[2].ssa), data);
6861
6862
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6863
6864
aco_opcode op32, op64;
6865
6866
if (ctx->options->chip_class >= GFX7) {
6867
bool global = ctx->options->chip_class >= GFX9;
6868
switch (instr->intrinsic) {
6869
case nir_intrinsic_global_atomic_add:
6870
op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6871
op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6872
break;
6873
case nir_intrinsic_global_atomic_imin:
6874
op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6875
op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6876
break;
6877
case nir_intrinsic_global_atomic_umin:
6878
op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6879
op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6880
break;
6881
case nir_intrinsic_global_atomic_imax:
6882
op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6883
op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6884
break;
6885
case nir_intrinsic_global_atomic_umax:
6886
op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6887
op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6888
break;
6889
case nir_intrinsic_global_atomic_and:
6890
op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6891
op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6892
break;
6893
case nir_intrinsic_global_atomic_or:
6894
op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6895
op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6896
break;
6897
case nir_intrinsic_global_atomic_xor:
6898
op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6899
op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6900
break;
6901
case nir_intrinsic_global_atomic_exchange:
6902
op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6903
op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6904
break;
6905
case nir_intrinsic_global_atomic_comp_swap:
6906
op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6907
op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6908
break;
6909
default:
6910
unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6911
"instructions.");
6912
}
6913
6914
aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6915
aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6916
op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6917
flat->operands[0] = Operand(addr);
6918
flat->operands[1] = Operand(s1);
6919
flat->operands[2] = Operand(data);
6920
if (return_previous)
6921
flat->definitions[0] = Definition(dst);
6922
flat->glc = return_previous;
6923
flat->dlc = false; /* Not needed for atomics */
6924
flat->offset = 0;
6925
flat->disable_wqm = true;
6926
flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6927
ctx->program->needs_exact = true;
6928
ctx->block->instructions.emplace_back(std::move(flat));
6929
} else {
6930
assert(ctx->options->chip_class == GFX6);
6931
6932
switch (instr->intrinsic) {
6933
case nir_intrinsic_global_atomic_add:
6934
op32 = aco_opcode::buffer_atomic_add;
6935
op64 = aco_opcode::buffer_atomic_add_x2;
6936
break;
6937
case nir_intrinsic_global_atomic_imin:
6938
op32 = aco_opcode::buffer_atomic_smin;
6939
op64 = aco_opcode::buffer_atomic_smin_x2;
6940
break;
6941
case nir_intrinsic_global_atomic_umin:
6942
op32 = aco_opcode::buffer_atomic_umin;
6943
op64 = aco_opcode::buffer_atomic_umin_x2;
6944
break;
6945
case nir_intrinsic_global_atomic_imax:
6946
op32 = aco_opcode::buffer_atomic_smax;
6947
op64 = aco_opcode::buffer_atomic_smax_x2;
6948
break;
6949
case nir_intrinsic_global_atomic_umax:
6950
op32 = aco_opcode::buffer_atomic_umax;
6951
op64 = aco_opcode::buffer_atomic_umax_x2;
6952
break;
6953
case nir_intrinsic_global_atomic_and:
6954
op32 = aco_opcode::buffer_atomic_and;
6955
op64 = aco_opcode::buffer_atomic_and_x2;
6956
break;
6957
case nir_intrinsic_global_atomic_or:
6958
op32 = aco_opcode::buffer_atomic_or;
6959
op64 = aco_opcode::buffer_atomic_or_x2;
6960
break;
6961
case nir_intrinsic_global_atomic_xor:
6962
op32 = aco_opcode::buffer_atomic_xor;
6963
op64 = aco_opcode::buffer_atomic_xor_x2;
6964
break;
6965
case nir_intrinsic_global_atomic_exchange:
6966
op32 = aco_opcode::buffer_atomic_swap;
6967
op64 = aco_opcode::buffer_atomic_swap_x2;
6968
break;
6969
case nir_intrinsic_global_atomic_comp_swap:
6970
op32 = aco_opcode::buffer_atomic_cmpswap;
6971
op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6972
break;
6973
default:
6974
unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6975
"instructions.");
6976
}
6977
6978
      Temp rsrc = get_gfx6_global_rsrc(bld, addr);

      aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;

      aco_ptr<MUBUF_instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
      mubuf->operands[0] = Operand(rsrc);
      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
      mubuf->operands[2] = Operand::zero();
      mubuf->operands[3] = Operand(data);
      if (return_previous)
         mubuf->definitions[0] = Definition(dst);
      mubuf->glc = return_previous;
      mubuf->dlc = false;
      mubuf->offset = 0;
      mubuf->addr64 = addr.type() == RegType::vgpr;
      mubuf->disable_wqm = true;
      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
   }
}
7000
7001
void
visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Builder bld(ctx->program, ctx->block);

   Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
   Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
   Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool reorder = nir_intrinsic_can_reorder(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
   unsigned num_components = intrin->dest.ssa.num_components;
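   /* When the buffer is swizzled (e.g. ring buffer access), a single element must not be split
    * across a swizzle boundary; the granularity is presumably 4 bytes on GFX8 and older and
    * 16 bytes on newer chips, which is the limit passed down to load_vmem_mubuf here. */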
   unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;

   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                   num_components, swizzle_element_size, !swizzled, reorder, slc);
}

void
visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
   Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
   Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
   Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;

   nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
   memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);

   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                    write_mask, !swizzled, sync, slc);
}
7045
7046
sync_scope
translate_nir_scope(nir_scope scope)
{
   switch (scope) {
   case NIR_SCOPE_NONE:
   case NIR_SCOPE_INVOCATION: return scope_invocation;
   case NIR_SCOPE_SUBGROUP: return scope_subgroup;
   case NIR_SCOPE_WORKGROUP: return scope_workgroup;
   case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
   case NIR_SCOPE_DEVICE: return scope_device;
   case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope");
   }
   unreachable("invalid scope");
}
7060
7061
void
emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   unsigned semantics = 0;
   unsigned storage = 0;
   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));

   /* We use shared storage for the following:
    * - compute shaders expose it in their API
    * - when tessellation is used, TCS and VS I/O is lowered to shared memory
    * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
    * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
    */
   bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
                              ctx->stage.hw == HWStage::HS ||
                              (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
                              ctx->stage.hw == HWStage::NGG;

   /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
    * They are allowed in CS, TCS, and in any NGG shader.
    */
   ASSERTED bool workgroup_scope_allowed =
      ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;

   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
   if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
      storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
   if (shared_storage_used && (nir_storage & nir_var_mem_shared))
      storage |= storage_shared;

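   /* Note: both NIR_MEMORY_ACQUIRE and NIR_MEMORY_RELEASE are folded into the combined
    * acquire|release semantics below. This is conservative (stronger than a one-sided barrier
    * strictly needs) but it keeps the sync-info handling simple. */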
   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
   if (nir_semantics & NIR_MEMORY_ACQUIRE)
      semantics |= semantic_acquire | semantic_release;
   if (nir_semantics & NIR_MEMORY_RELEASE)
      semantics |= semantic_acquire | semantic_release;

   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);

   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
               exec_scope);
}
7107
7108
void
visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Builder bld(ctx->program, ctx->block);

   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
   unsigned num_components = instr->dest.ssa.num_components;
   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
}

void
visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned writemask = nir_intrinsic_write_mask(instr);
   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;

   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
}
7133
7134
void
7135
visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7136
{
7137
unsigned offset = nir_intrinsic_base(instr);
7138
Builder bld(ctx->program, ctx->block);
7139
Operand m = load_lds_size_m0(bld);
7140
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7141
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7142
7143
unsigned num_operands = 3;
7144
aco_opcode op32, op64, op32_rtn, op64_rtn;
7145
switch (instr->intrinsic) {
7146
case nir_intrinsic_shared_atomic_add:
7147
op32 = aco_opcode::ds_add_u32;
7148
op64 = aco_opcode::ds_add_u64;
7149
op32_rtn = aco_opcode::ds_add_rtn_u32;
7150
op64_rtn = aco_opcode::ds_add_rtn_u64;
7151
break;
7152
case nir_intrinsic_shared_atomic_imin:
7153
op32 = aco_opcode::ds_min_i32;
7154
op64 = aco_opcode::ds_min_i64;
7155
op32_rtn = aco_opcode::ds_min_rtn_i32;
7156
op64_rtn = aco_opcode::ds_min_rtn_i64;
7157
break;
7158
case nir_intrinsic_shared_atomic_umin:
7159
op32 = aco_opcode::ds_min_u32;
7160
op64 = aco_opcode::ds_min_u64;
7161
op32_rtn = aco_opcode::ds_min_rtn_u32;
7162
op64_rtn = aco_opcode::ds_min_rtn_u64;
7163
break;
7164
case nir_intrinsic_shared_atomic_imax:
7165
op32 = aco_opcode::ds_max_i32;
7166
op64 = aco_opcode::ds_max_i64;
7167
op32_rtn = aco_opcode::ds_max_rtn_i32;
7168
op64_rtn = aco_opcode::ds_max_rtn_i64;
7169
break;
7170
case nir_intrinsic_shared_atomic_umax:
7171
op32 = aco_opcode::ds_max_u32;
7172
op64 = aco_opcode::ds_max_u64;
7173
op32_rtn = aco_opcode::ds_max_rtn_u32;
7174
op64_rtn = aco_opcode::ds_max_rtn_u64;
7175
break;
7176
case nir_intrinsic_shared_atomic_and:
7177
op32 = aco_opcode::ds_and_b32;
7178
op64 = aco_opcode::ds_and_b64;
7179
op32_rtn = aco_opcode::ds_and_rtn_b32;
7180
op64_rtn = aco_opcode::ds_and_rtn_b64;
7181
break;
7182
case nir_intrinsic_shared_atomic_or:
7183
op32 = aco_opcode::ds_or_b32;
7184
op64 = aco_opcode::ds_or_b64;
7185
op32_rtn = aco_opcode::ds_or_rtn_b32;
7186
op64_rtn = aco_opcode::ds_or_rtn_b64;
7187
break;
7188
case nir_intrinsic_shared_atomic_xor:
7189
op32 = aco_opcode::ds_xor_b32;
7190
op64 = aco_opcode::ds_xor_b64;
7191
op32_rtn = aco_opcode::ds_xor_rtn_b32;
7192
op64_rtn = aco_opcode::ds_xor_rtn_b64;
7193
break;
7194
case nir_intrinsic_shared_atomic_exchange:
7195
op32 = aco_opcode::ds_write_b32;
7196
op64 = aco_opcode::ds_write_b64;
7197
op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7198
op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7199
break;
7200
case nir_intrinsic_shared_atomic_comp_swap:
7201
op32 = aco_opcode::ds_cmpst_b32;
7202
op64 = aco_opcode::ds_cmpst_b64;
7203
op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7204
op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7205
num_operands = 4;
7206
break;
7207
case nir_intrinsic_shared_atomic_fadd:
7208
op32 = aco_opcode::ds_add_f32;
7209
op32_rtn = aco_opcode::ds_add_rtn_f32;
7210
op64 = aco_opcode::num_opcodes;
7211
op64_rtn = aco_opcode::num_opcodes;
7212
break;
7213
default: unreachable("Unhandled shared atomic intrinsic");
7214
}
7215
7216
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7217
7218
aco_opcode op;
7219
if (data.size() == 1) {
7220
assert(instr->dest.ssa.bit_size == 32);
7221
op = return_previous ? op32_rtn : op32;
7222
} else {
7223
assert(instr->dest.ssa.bit_size == 64);
7224
op = return_previous ? op64_rtn : op64;
7225
}
7226
7227
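   /* The DS offset0 field is a 16-bit immediate, so a larger constant offset has to be folded
    * into the address register instead. */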
   if (offset > 65535) {
      address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
      offset = 0;
   }

   aco_ptr<DS_instruction> ds;
   ds.reset(
      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
   ds->operands[0] = Operand(address);
   ds->operands[1] = Operand(data);
   if (num_operands == 4) {
      Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
      ds->operands[2] = Operand(data2);
   }
   ds->operands[num_operands - 1] = m;
   ds->offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
   ctx->block->instructions.emplace_back(std::move(ds));
}
7248
7249
Temp
get_scratch_resource(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);
   Temp scratch_addr = ctx->program->private_segment_buffer;
   if (ctx->stage != compute_cs)
      scratch_addr =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());

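   /* ADD_TID_ENABLE makes the hardware add the thread id (scaled by the swizzle stride) to the
    * address, which is how the scratch accesses of the lanes in a wave get interleaved.
    * INDEX_STRIDE presumably encodes that stride: 3 selects 64 elements for wave64 and
    * 2 selects 32 elements for wave32. */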
   uint32_t rsrc_conf =
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);

   if (ctx->program->chip_class >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
   } else if (ctx->program->chip_class <= GFX7) {
      /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx->program->chip_class <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}
7277
7278
void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp rsrc = get_scratch_resource(ctx);
   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

   LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
                        instr->dest.ssa.bit_size / 8u, rsrc};
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
   info.sync = memory_sync_info(storage_scratch, semantic_private);
   info.soffset = ctx->program->scratch_offset;
   emit_load(ctx, bld, info, scratch_load_params);
}
7295
7296
void
visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp rsrc = get_scratch_resource(ctx);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));

   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
                      &write_count, write_datas, offsets);

   for (unsigned i = 0; i < write_count; i++) {
      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
      Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
                                     offsets[i], true, true);
      mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
   }
}
7321
7322
void
7323
visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7324
{
7325
uint8_t log2_ps_iter_samples;
7326
if (ctx->program->info->ps.uses_sample_shading) {
7327
log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples);
7328
} else {
7329
log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
7330
}
7331
7332
Builder bld(ctx->program, ctx->block);
7333
7334
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7335
7336
if (log2_ps_iter_samples) {
7337
/* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
7338
Temp sample_id =
7339
bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7340
Operand::c32(8u), Operand::c32(4u));
7341
Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7342
bld.copy(bld.def(v1), Operand::c32(1u)));
7343
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7344
get_arg(ctx, ctx->args->ac.sample_coverage));
7345
} else {
7346
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7347
}
7348
}
7349
7350
void
7351
visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7352
{
7353
Builder bld(ctx->program, ctx->block);
7354
7355
unsigned stream = nir_intrinsic_stream_id(instr);
7356
Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7357
next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7358
nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7359
7360
/* get GSVS ring */
7361
Temp gsvs_ring =
7362
bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7363
Operand::c32(RING_GSVS_GS * 16u));
7364
7365
unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7366
7367
unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7368
unsigned stream_offset = 0;
7369
for (unsigned i = 0; i < stream; i++) {
7370
unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7371
ctx->shader->info.gs.vertices_out;
7372
stream_offset += prev_stride * ctx->program->wave_size;
7373
}
7374
7375
/* Limit on the stride field for <= GFX7. */
7376
assert(stride < (1 << 14));
7377
7378
Temp gsvs_dwords[4];
7379
for (unsigned i = 0; i < 4; i++)
7380
gsvs_dwords[i] = bld.tmp(s1);
7381
bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7382
Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7383
7384
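   /* Patch the GSVS ring descriptor: add the per-stream offset to the 64-bit base address
    * (dwords 0-1), OR the ring stride into dword 1 and set dword 2 (num_records) to the
    * wave size. */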
if (stream_offset) {
7385
Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7386
7387
Temp carry = bld.tmp(s1);
7388
gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7389
gsvs_dwords[0], stream_offset_tmp);
7390
gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7391
gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7392
}
7393
7394
gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7395
Operand::c32(S_008F04_STRIDE(stride)));
7396
gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7397
7398
gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7399
gsvs_dwords[2], gsvs_dwords[3]);
7400
7401
unsigned offset = 0;
7402
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7403
if (ctx->program->info->gs.output_streams[i] != stream)
7404
continue;
7405
7406
for (unsigned j = 0; j < 4; j++) {
7407
if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7408
continue;
7409
7410
if (ctx->outputs.mask[i] & (1 << j)) {
7411
Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7412
unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
7413
if (const_offset >= 4096u) {
7414
if (vaddr_offset.isUndefined())
7415
vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7416
else
7417
vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7418
vaddr_offset);
7419
const_offset %= 4096u;
7420
}
7421
7422
aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7423
aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7424
mtbuf->operands[0] = Operand(gsvs_ring);
7425
mtbuf->operands[1] = vaddr_offset;
7426
mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7427
mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7428
mtbuf->offen = !vaddr_offset.isUndefined();
7429
mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7430
mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7431
mtbuf->offset = const_offset;
7432
mtbuf->glc = true;
7433
mtbuf->slc = true;
7434
mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7435
bld.insert(std::move(mtbuf));
7436
}
7437
7438
offset += ctx->shader->info.gs.vertices_out;
7439
}
7440
7441
/* outputs for the next vertex are undefined and keeping them around can
7442
* create invalid IR with control flow */
7443
ctx->outputs.mask[i] = 0;
7444
}
7445
7446
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7447
}
7448
7449
Temp
7450
emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7451
{
7452
Builder bld(ctx->program, ctx->block);
7453
7454
if (cluster_size == 1) {
7455
return src;
7456
}
7457
if (op == nir_op_iand && cluster_size == 4) {
7458
/* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
7459
Temp tmp =
7460
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7461
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7462
bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7463
} else if (op == nir_op_ior && cluster_size == 4) {
7464
/* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7465
return bld.sop1(
7466
Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7467
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7468
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7469
/* subgroupAnd(val) -> (exec & ~val) == 0 */
7470
Temp tmp =
7471
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7472
.def(1)
7473
.getTemp();
7474
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7475
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7476
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7477
/* subgroupOr(val) -> (val & exec) != 0 */
7478
Temp tmp =
7479
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7480
.def(1)
7481
.getTemp();
7482
return bool_to_vector_condition(ctx, tmp);
7483
} else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7484
/* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7485
Temp tmp =
7486
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7487
tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7488
tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7489
.def(1)
7490
.getTemp();
7491
return bool_to_vector_condition(ctx, tmp);
7492
} else {
7493
      /* subgroupClustered{And,Or,Xor}(val, n):
       * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
       * cluster_offset = ~(n - 1) & lane_id
       * cluster_mask = ((1 << n) - 1)
       * subgroupClusteredAnd():
       *    return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
       * subgroupClusteredOr():
       *    return ((val & exec) >> cluster_offset) & cluster_mask != 0
       * subgroupClusteredXor():
       *    return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
       */
7503
Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7504
Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7505
Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7506
7507
Temp tmp;
7508
if (op == nir_op_iand)
7509
tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7510
Operand(exec, bld.lm));
7511
else
7512
tmp =
7513
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7514
7515
uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7516
7517
if (ctx->program->chip_class <= GFX7)
7518
tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7519
else if (ctx->program->wave_size == 64)
7520
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7521
else
7522
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7523
tmp = emit_extract_vector(ctx, tmp, 0, v1);
7524
if (cluster_mask != 0xffffffff)
7525
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7526
7527
if (op == nir_op_iand) {
7528
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7529
tmp);
7530
} else if (op == nir_op_ior) {
7531
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7532
} else if (op == nir_op_ixor) {
7533
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7534
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7535
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7536
}
7537
assert(false);
7538
return Temp();
7539
}
7540
}
7541
7542
Temp
7543
emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7544
{
7545
Builder bld(ctx->program, ctx->block);
7546
assert(src.regClass() == bld.lm);
7547
7548
/* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7549
* subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7550
* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7551
*/
7552
Temp tmp;
7553
if (op == nir_op_iand)
7554
tmp =
7555
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7556
else
7557
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7558
7559
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7560
7561
if (op == nir_op_iand)
7562
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7563
else if (op == nir_op_ior)
7564
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7565
else if (op == nir_op_ixor)
7566
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7567
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7568
7569
assert(false);
7570
return Temp();
7571
}
7572
7573
Temp
7574
emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7575
{
7576
Builder bld(ctx->program, ctx->block);
7577
7578
/* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7579
* subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7580
* subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7581
*/
7582
Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7583
if (op == nir_op_iand)
7584
return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7585
else if (op == nir_op_ior)
7586
return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7587
else if (op == nir_op_ixor)
7588
return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7589
7590
assert(false);
7591
return Temp();
7592
}
7593
7594
ReduceOp
7595
get_reduce_op(nir_op op, unsigned bit_size)
7596
{
7597
switch (op) {
7598
#define CASEI(name) \
7599
case nir_op_##name: \
7600
return (bit_size == 32) ? name##32 \
7601
: (bit_size == 16) ? name##16 \
7602
: (bit_size == 8) ? name##8 \
7603
: name##64;
7604
#define CASEF(name) \
7605
case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7606
CASEI(iadd)
7607
CASEI(imul)
7608
CASEI(imin)
7609
CASEI(umin)
7610
CASEI(imax)
7611
CASEI(umax)
7612
CASEI(iand)
7613
CASEI(ior)
7614
CASEI(ixor)
7615
CASEF(fadd)
7616
CASEF(fmul)
7617
CASEF(fmin)
7618
CASEF(fmax)
7619
default: unreachable("unknown reduction op");
7620
#undef CASEI
7621
#undef CASEF
7622
}
7623
}
7624
7625
void
7626
emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7627
{
7628
Builder bld(ctx->program, ctx->block);
7629
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7630
assert(dst.regClass().type() != RegType::vgpr);
7631
if (src.regClass().type() == RegType::vgpr)
7632
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7633
else
7634
bld.copy(dst, src);
7635
}
7636
7637
void
7638
emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7639
{
7640
Builder bld(ctx->program, ctx->block);
7641
Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7642
7643
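   /* With a uniform source, the sum over all active lanes is simply count * val (for ixor only
    * the parity of count matters, which is handled below), so the whole reduction collapses to
    * a single multiply. */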
if (op == nir_op_fadd) {
7644
src_tmp = as_vgpr(ctx, src_tmp);
7645
Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7646
7647
if (src.ssa->bit_size == 16) {
7648
count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7649
bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7650
} else {
7651
assert(src.ssa->bit_size == 32);
7652
count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7653
bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7654
}
7655
7656
if (tmp != dst.getTemp())
7657
bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7658
7659
return;
7660
}
7661
7662
if (dst.regClass() == s1)
7663
src_tmp = bld.as_uniform(src_tmp);
7664
7665
if (op == nir_op_ixor && count.type() == RegType::sgpr)
7666
count =
7667
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7668
else if (op == nir_op_ixor)
7669
count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7670
7671
assert(dst.getTemp().type() == count.type());
7672
7673
if (nir_src_is_const(src)) {
7674
if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7675
bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7676
else if (nir_src_as_uint(src) == 1)
7677
bld.copy(dst, count);
7678
else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7679
bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7680
else if (nir_src_as_uint(src) == 0)
7681
bld.copy(dst, Operand::zero());
7682
else if (count.type() == RegType::vgpr)
7683
bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7684
else
7685
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7686
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7687
bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7688
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7689
bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7690
} else if (dst.getTemp().type() == RegType::vgpr) {
7691
bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7692
} else {
7693
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7694
}
7695
}
7696
7697
bool
7698
emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7699
{
7700
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7701
if (op == nir_op_imul || op == nir_op_fmul)
7702
return false;
7703
7704
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7705
Builder bld(ctx->program, ctx->block);
7706
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7707
unsigned bit_size = instr->src[0].ssa->bit_size;
7708
if (bit_size > 32)
7709
return false;
7710
7711
Temp thread_count =
7712
bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7713
7714
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7715
} else {
7716
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7717
}
7718
7719
return true;
7720
}
7721
7722
bool
7723
emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7724
{
7725
Builder bld(ctx->program, ctx->block);
7726
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7727
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7728
bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7729
7730
if (op == nir_op_imul || op == nir_op_fmul)
7731
return false;
7732
7733
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7734
if (instr->src[0].ssa->bit_size > 32)
7735
return false;
7736
7737
Temp packed_tid;
7738
if (inc)
7739
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7740
else
7741
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7742
7743
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7744
return true;
7745
}
7746
7747
assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7748
op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7749
7750
if (inc) {
7751
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7752
return true;
7753
}
7754
7755
/* Copy the source and write the reduction operation identity to the first lane. */
7756
Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7757
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7758
ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7759
if (dst.bytes() == 8) {
7760
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7761
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7762
uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7763
uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7764
7765
lo =
7766
bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7767
hi =
7768
bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7769
bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7770
} else {
7771
uint32_t identity = get_reduction_identity(reduce_op, 0);
7772
bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7773
as_vgpr(ctx, src));
7774
}
7775
7776
return true;
7777
}
7778
7779
Temp
7780
emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7781
Definition dst, Temp src)
7782
{
7783
assert(src.bytes() <= 8);
7784
assert(src.type() == RegType::vgpr);
7785
7786
Builder bld(ctx->program, ctx->block);
7787
7788
unsigned num_defs = 0;
7789
Definition defs[5];
7790
defs[num_defs++] = dst;
7791
defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7792
7793
/* scalar identity temporary */
7794
bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7795
aco_op != aco_opcode::p_reduce;
7796
if (aco_op == aco_opcode::p_exclusive_scan) {
7797
need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7798
op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7799
op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7800
op == fmul64);
7801
}
7802
if (need_sitmp)
7803
defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7804
7805
/* scc clobber */
7806
defs[num_defs++] = bld.def(s1, scc);
7807
7808
/* vcc clobber */
7809
bool clobber_vcc = false;
7810
if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7811
clobber_vcc = true;
7812
if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7813
clobber_vcc = true;
7814
if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7815
clobber_vcc = true;
7816
7817
if (clobber_vcc)
7818
defs[num_defs++] = bld.def(bld.lm, vcc);
7819
7820
Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7821
aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7822
reduce->operands[0] = Operand(src);
7823
/* setup_reduce_temp will update these undef operands if needed */
7824
reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7825
reduce->operands[2] = Operand(v1.as_linear());
7826
std::copy(defs, defs + num_defs, reduce->definitions.begin());
7827
7828
reduce->reduce_op = op;
7829
reduce->cluster_size = cluster_size;
7830
bld.insert(std::move(reduce));
7831
7832
return dst.getTemp();
7833
}
7834
7835
void
7836
emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2)
7837
{
7838
Builder bld(ctx->program, ctx->block);
7839
Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7840
Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7841
Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7842
7843
Temp ddx_1, ddx_2, ddy_1, ddy_2;
7844
uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7845
uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7846
uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7847
7848
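   /* The quad_perm DPP controls broadcast one pixel of each 2x2 quad to all four lanes:
    * lane 0 is the top-left pixel, lane 1 its right neighbor and lane 2 the pixel below,
    * so ddx = p(lane1) - p(lane0) and ddy = p(lane2) - p(lane0). */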
/* Build DD X/Y */
7849
if (ctx->program->chip_class >= GFX8) {
7850
Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7851
ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7852
ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7853
Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7854
ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7855
ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7856
} else {
7857
Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7858
ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7859
ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7860
ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7861
ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7862
Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7863
ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7864
ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7865
ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7866
ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7867
}
7868
7869
/* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7870
aco_opcode mad =
7871
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7872
Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7873
Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7874
tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7875
tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7876
Temp wqm1 = bld.tmp(v1);
7877
emit_wqm(bld, tmp1, wqm1, true);
7878
Temp wqm2 = bld.tmp(v1);
7879
emit_wqm(bld, tmp2, wqm2, true);
7880
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7881
return;
7882
}
7883
7884
Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7885
void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7886
static void create_vs_exports(isel_context* ctx);
7887
7888
void
7889
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7890
{
7891
Builder bld(ctx->program, ctx->block);
7892
switch (instr->intrinsic) {
7893
case nir_intrinsic_load_barycentric_sample:
7894
case nir_intrinsic_load_barycentric_pixel:
7895
case nir_intrinsic_load_barycentric_centroid: {
7896
glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7897
Temp bary = Temp(0, s2);
7898
switch (mode) {
7899
case INTERP_MODE_SMOOTH:
7900
case INTERP_MODE_NONE:
7901
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7902
bary = get_arg(ctx, ctx->args->ac.persp_center);
7903
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7904
bary = ctx->persp_centroid;
7905
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7906
bary = get_arg(ctx, ctx->args->ac.persp_sample);
7907
break;
7908
case INTERP_MODE_NOPERSPECTIVE:
7909
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7910
bary = get_arg(ctx, ctx->args->ac.linear_center);
7911
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7912
bary = ctx->linear_centroid;
7913
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7914
bary = get_arg(ctx, ctx->args->ac.linear_sample);
7915
break;
7916
default: break;
7917
}
7918
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7919
Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7920
Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7921
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7922
emit_split_vector(ctx, dst, 2);
7923
break;
7924
}
7925
case nir_intrinsic_load_barycentric_model: {
7926
Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7927
7928
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7929
Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7930
Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7931
Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7932
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7933
Operand(p3));
7934
emit_split_vector(ctx, dst, 3);
7935
break;
7936
}
7937
case nir_intrinsic_load_barycentric_at_sample: {
7938
uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7939
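      /* The sample positions table presumably stores the 1x, 2x, 4x and 8x patterns back to
       * back, 8 bytes per position, so skip 1, 1+2 or 1+2+4 entries to reach the start of the
       * pattern for the current sample count. */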
switch (ctx->options->key.fs.num_samples) {
7940
case 2: sample_pos_offset += 1 << 3; break;
7941
case 4: sample_pos_offset += 3 << 3; break;
7942
case 8: sample_pos_offset += 7 << 3; break;
7943
default: break;
7944
}
7945
Temp sample_pos;
7946
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7947
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7948
Temp private_segment_buffer = ctx->program->private_segment_buffer;
7949
// TODO: bounds checking?
7950
if (addr.type() == RegType::sgpr) {
7951
Operand offset;
7952
if (const_addr) {
7953
sample_pos_offset += const_addr->u32 << 3;
7954
offset = Operand::c32(sample_pos_offset);
7955
} else if (ctx->options->chip_class >= GFX9) {
7956
offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7957
Operand::c32(sample_pos_offset));
7958
} else {
7959
offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7960
Operand::c32(3u));
7961
offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7962
Operand::c32(sample_pos_offset));
7963
}
7964
7965
Operand off = bld.copy(bld.def(s1), Operand(offset));
7966
sample_pos =
7967
bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7968
7969
} else if (ctx->options->chip_class >= GFX9) {
7970
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7971
sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7972
private_segment_buffer, sample_pos_offset);
7973
} else if (ctx->options->chip_class >= GFX7) {
7974
/* addr += private_segment_buffer + sample_pos_offset */
7975
Temp tmp0 = bld.tmp(s1);
7976
Temp tmp1 = bld.tmp(s1);
7977
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7978
private_segment_buffer);
7979
Definition scc_tmp = bld.def(s1, scc);
7980
tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7981
Operand::c32(sample_pos_offset));
7982
tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7983
Operand::zero(), bld.scc(scc_tmp.getTemp()));
7984
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7985
Temp pck0 = bld.tmp(v1);
7986
Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7987
tmp1 = as_vgpr(ctx, tmp1);
7988
Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
7989
bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
7990
addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7991
7992
/* sample_pos = flat_load_dwordx2 addr */
7993
sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7994
} else {
7995
assert(ctx->options->chip_class == GFX6);
7996
7997
uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7998
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7999
Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8000
Operand::zero(), Operand::c32(rsrc_conf));
8001
8002
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8003
addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8004
8005
sample_pos = bld.tmp(v2);
8006
8007
aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8008
aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8009
load->definitions[0] = Definition(sample_pos);
8010
load->operands[0] = Operand(rsrc);
8011
load->operands[1] = Operand(addr);
8012
load->operands[2] = Operand::zero();
8013
load->offset = sample_pos_offset;
8014
load->offen = 0;
8015
load->addr64 = true;
8016
load->glc = false;
8017
load->dlc = false;
8018
load->disable_wqm = false;
8019
ctx->block->instructions.emplace_back(std::move(load));
8020
}
8021
8022
/* sample_pos -= 0.5 */
8023
Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8024
Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8025
bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8026
pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8027
pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8028
8029
emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
8030
break;
8031
}
8032
case nir_intrinsic_load_barycentric_at_offset: {
8033
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8034
RegClass rc = RegClass(offset.type(), 1);
8035
Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8036
bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8037
emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
8038
break;
8039
}
8040
case nir_intrinsic_load_front_face: {
8041
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8042
Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8043
.def(0)
8044
.setHint(vcc);
8045
break;
8046
}
8047
case nir_intrinsic_load_view_index: {
8048
if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) ||
8049
ctx->stage.has(SWStage::TCS) || ctx->stage.has(SWStage::TES)) {
8050
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8051
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8052
break;
8053
}
8054
FALLTHROUGH;
8055
}
8056
case nir_intrinsic_load_layer_id: {
8057
unsigned idx = nir_intrinsic_base(instr);
8058
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8059
Operand::c32(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
8060
break;
8061
}
8062
case nir_intrinsic_load_frag_coord: {
8063
emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8064
break;
8065
}
8066
case nir_intrinsic_load_frag_shading_rate:
8067
emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8068
break;
8069
case nir_intrinsic_load_sample_pos: {
8070
Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8071
Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8072
bld.pseudo(
8073
aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8074
posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8075
posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8076
break;
8077
}
8078
case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8079
case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8080
case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8081
case nir_intrinsic_load_input:
8082
case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8083
case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8084
case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8085
case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8086
case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8087
case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8088
case nir_intrinsic_terminate:
8089
case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8090
case nir_intrinsic_terminate_if:
8091
case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8092
case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8093
case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8094
case nir_intrinsic_shared_atomic_add:
8095
case nir_intrinsic_shared_atomic_imin:
8096
case nir_intrinsic_shared_atomic_umin:
8097
case nir_intrinsic_shared_atomic_imax:
8098
case nir_intrinsic_shared_atomic_umax:
8099
case nir_intrinsic_shared_atomic_and:
8100
case nir_intrinsic_shared_atomic_or:
8101
case nir_intrinsic_shared_atomic_xor:
8102
case nir_intrinsic_shared_atomic_exchange:
8103
case nir_intrinsic_shared_atomic_comp_swap:
8104
case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break;
8105
case nir_intrinsic_image_deref_load:
8106
case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8107
case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8108
case nir_intrinsic_image_deref_atomic_add:
8109
case nir_intrinsic_image_deref_atomic_umin:
8110
case nir_intrinsic_image_deref_atomic_imin:
8111
case nir_intrinsic_image_deref_atomic_umax:
8112
case nir_intrinsic_image_deref_atomic_imax:
8113
case nir_intrinsic_image_deref_atomic_and:
8114
case nir_intrinsic_image_deref_atomic_or:
8115
case nir_intrinsic_image_deref_atomic_xor:
8116
case nir_intrinsic_image_deref_atomic_exchange:
8117
case nir_intrinsic_image_deref_atomic_comp_swap: visit_image_atomic(ctx, instr); break;
8118
case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8119
case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8120
case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8121
case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8122
case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8123
case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8124
case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8125
case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8126
case nir_intrinsic_global_atomic_add:
8127
case nir_intrinsic_global_atomic_imin:
8128
case nir_intrinsic_global_atomic_umin:
8129
case nir_intrinsic_global_atomic_imax:
8130
case nir_intrinsic_global_atomic_umax:
8131
case nir_intrinsic_global_atomic_and:
8132
case nir_intrinsic_global_atomic_or:
8133
case nir_intrinsic_global_atomic_xor:
8134
case nir_intrinsic_global_atomic_exchange:
8135
case nir_intrinsic_global_atomic_comp_swap: visit_global_atomic(ctx, instr); break;
8136
case nir_intrinsic_ssbo_atomic_add:
8137
case nir_intrinsic_ssbo_atomic_imin:
8138
case nir_intrinsic_ssbo_atomic_umin:
8139
case nir_intrinsic_ssbo_atomic_imax:
8140
case nir_intrinsic_ssbo_atomic_umax:
8141
case nir_intrinsic_ssbo_atomic_and:
8142
case nir_intrinsic_ssbo_atomic_or:
8143
case nir_intrinsic_ssbo_atomic_xor:
8144
case nir_intrinsic_ssbo_atomic_exchange:
8145
case nir_intrinsic_ssbo_atomic_comp_swap: visit_atomic_ssbo(ctx, instr); break;
8146
case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8147
case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8148
case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8149
case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8150
case nir_intrinsic_load_num_workgroups: {
8151
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8152
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8153
emit_split_vector(ctx, dst, 3);
8154
break;
8155
}
8156
case nir_intrinsic_load_local_invocation_id: {
8157
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8158
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8159
emit_split_vector(ctx, dst, 3);
8160
break;
8161
}
8162
case nir_intrinsic_load_workgroup_id: {
8163
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8164
struct ac_arg* args = ctx->args->ac.workgroup_ids;
8165
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8166
args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8167
args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8168
args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8169
emit_split_vector(ctx, dst, 3);
8170
break;
8171
}
8172
case nir_intrinsic_load_local_invocation_index: {
8173
if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8174
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8175
get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8176
break;
8177
} else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8178
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8179
break;
8180
}
8181
8182
Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8183
8184
/* The tg_size bits [6:11] contain the subgroup id,
8185
* we need this multiplied by the wave size, and then OR the thread id to it.
8186
*/
8187
if (ctx->program->wave_size == 64) {
8188
/* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8189
* feed that to v_or */
8190
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8191
Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8192
bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8193
id);
8194
} else {
8195
/* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8196
Temp tg_num =
8197
bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8198
get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8199
bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8200
tg_num, Operand::c32(0x5u), id);
8201
}
8202
break;
8203
}
8204
case nir_intrinsic_load_subgroup_id: {
8205
if (ctx->stage == compute_cs) {
8206
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8207
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8208
Operand::c32(0x6u | (0x6u << 16)));
8209
} else if (ctx->stage.hw == HWStage::NGG) {
8210
/* Get the id of the current wave within the threadgroup (workgroup) */
8211
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8212
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8213
Operand::c32(24u | (4u << 16)));
8214
} else {
8215
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8216
}
8217
break;
8218
}
8219
case nir_intrinsic_load_subgroup_invocation: {
8220
emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8221
break;
8222
}
8223
case nir_intrinsic_load_num_subgroups: {
8224
if (ctx->stage == compute_cs)
8225
bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8226
bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8227
else if (ctx->stage.hw == HWStage::NGG)
8228
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8229
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8230
Operand::c32(28u | (4u << 16)));
8231
else
8232
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8233
break;
8234
}
8235
case nir_intrinsic_ballot: {
8236
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8237
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8238
8239
if (instr->src[0].ssa->bit_size == 1) {
8240
assert(src.regClass() == bld.lm);
8241
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8242
src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8243
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8244
src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8245
} else {
8246
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8247
}
8248
8249
/* Make sure that all inactive lanes return zero.
8250
* Value-numbering might remove the comparison above */
8251
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
8252
if (dst.size() != bld.lm.size()) {
8253
/* Wave32 with ballot size set to 64 */
8254
src =
8255
bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8256
}
8257
8258
emit_wqm(bld, src, dst);
8259
break;
8260
}
8261
case nir_intrinsic_shuffle:
8262
case nir_intrinsic_read_invocation: {
8263
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8264
if (!nir_src_is_divergent(instr->src[0])) {
8265
emit_uniform_subgroup(ctx, instr, src);
8266
} else {
8267
Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8268
if (instr->intrinsic == nir_intrinsic_read_invocation ||
8269
!nir_src_is_divergent(instr->src[1]))
8270
tid = bld.as_uniform(tid);
8271
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8272
8273
if (instr->dest.ssa.bit_size != 1)
8274
src = as_vgpr(ctx, src);
8275
8276
if (src.regClass() == v1b || src.regClass() == v2b) {
8277
Temp tmp = bld.tmp(v1);
8278
tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8279
if (dst.type() == RegType::vgpr)
8280
bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8281
bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8282
else
8283
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8284
} else if (src.regClass() == v1) {
8285
emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8286
} else if (src.regClass() == v2) {
8287
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8288
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8289
lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8290
hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8291
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8292
emit_split_vector(ctx, dst, 2);
8293
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
8294
assert(src.regClass() == bld.lm);
8295
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8296
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8297
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
8298
assert(src.regClass() == bld.lm);
8299
Temp tmp;
8300
if (ctx->program->chip_class <= GFX7)
8301
tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8302
else if (ctx->program->wave_size == 64)
8303
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8304
else
8305
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8306
tmp = emit_extract_vector(ctx, tmp, 0, v1);
8307
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8308
emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8309
dst);
8310
} else {
8311
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8312
}
8313
}
8314
break;
8315
}
8316
case nir_intrinsic_load_sample_id: {
8317
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8318
get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8319
break;
8320
}
8321
case nir_intrinsic_load_sample_mask_in: {
8322
visit_load_sample_mask_in(ctx, instr);
8323
break;
8324
}
8325
case nir_intrinsic_read_first_invocation: {
8326
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8327
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8328
if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8329
emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8330
} else if (src.regClass() == v2) {
8331
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8332
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8333
lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8334
hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8335
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8336
emit_split_vector(ctx, dst, 2);
8337
} else if (instr->dest.ssa.bit_size == 1) {
8338
assert(src.regClass() == bld.lm);
8339
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8340
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8341
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8342
} else {
8343
bld.copy(Definition(dst), src);
8344
}
8345
break;
8346
}
8347
case nir_intrinsic_vote_all: {
8348
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8349
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8350
assert(src.regClass() == bld.lm);
8351
assert(dst.regClass() == bld.lm);
8352
8353
Temp tmp =
8354
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
8355
.def(1)
8356
.getTemp();
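/* tmp is the SCC result of (exec & ~src): it is set iff some active lane holds a
 * false value. Negating the vectorized condition below therefore yields
 * "all active lanes are true". */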
8357
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8358
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8359
break;
8360
}
8361
case nir_intrinsic_vote_any: {
8362
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8363
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8364
assert(src.regClass() == bld.lm);
8365
assert(dst.regClass() == bld.lm);
8366
8367
Temp tmp = bool_to_scalar_condition(ctx, src);
8368
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8369
break;
8370
}
8371
case nir_intrinsic_reduce:
8372
case nir_intrinsic_inclusive_scan:
8373
case nir_intrinsic_exclusive_scan: {
8374
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8375
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8376
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8377
unsigned cluster_size =
8378
instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8379
cluster_size = util_next_power_of_two(
8380
MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8381
8382
if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8383
instr->dest.ssa.bit_size != 1) {
8384
/* We use divergence analysis to assign the regclass, so check if it's
8385
* working as expected */
8386
ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8387
if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8388
expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8389
assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
8390
8391
if (instr->intrinsic == nir_intrinsic_reduce) {
8392
if (emit_uniform_reduce(ctx, instr))
8393
break;
8394
} else if (emit_uniform_scan(ctx, instr)) {
8395
break;
8396
}
8397
}
8398
8399
if (instr->dest.ssa.bit_size == 1) {
8400
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8401
op = nir_op_iand;
8402
else if (op == nir_op_iadd)
8403
op = nir_op_ixor;
8404
else if (op == nir_op_umax || op == nir_op_imax)
8405
op = nir_op_ior;
8406
assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8407
8408
switch (instr->intrinsic) {
8409
case nir_intrinsic_reduce:
8410
emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8411
break;
8412
case nir_intrinsic_exclusive_scan:
8413
emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8414
break;
8415
case nir_intrinsic_inclusive_scan:
8416
emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8417
break;
8418
default: assert(false);
8419
}
8420
} else if (cluster_size == 1) {
8421
bld.copy(Definition(dst), src);
8422
} else {
8423
unsigned bit_size = instr->src[0].ssa->bit_size;
8424
8425
src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8426
8427
ReduceOp reduce_op = get_reduce_op(op, bit_size);
8428
8429
aco_opcode aco_op;
8430
switch (instr->intrinsic) {
8431
case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8432
case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8433
case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8434
default: unreachable("unknown reduce intrinsic");
8435
}
8436
8437
Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8438
bld.def(dst.regClass()), src);
8439
emit_wqm(bld, tmp_dst, dst);
8440
}
8441
break;
8442
}
8443
case nir_intrinsic_quad_broadcast: {
8444
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8445
if (!nir_dest_is_divergent(instr->dest)) {
8446
emit_uniform_subgroup(ctx, instr, src);
8447
} else {
8448
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8449
unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
8450
uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8451
8452
if (instr->dest.ssa.bit_size != 1)
8453
src = as_vgpr(ctx, src);
8454
8455
if (instr->dest.ssa.bit_size == 1) {
8456
assert(src.regClass() == bld.lm);
8457
assert(dst.regClass() == bld.lm);
8458
uint32_t half_mask = 0x11111111u << lane;
8459
Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
8460
Operand::c32(half_mask), Operand::c32(half_mask));
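/* Each quad owns 4 consecutive bits of the bool lane mask, so 0x11111111 << lane
 * selects bit `lane` of every quad. ANDing src with exec and with this mask keeps
 * only the broadcast lane's value per quad, and s_wqm then replicates any set bit
 * to all four bits of its quad. */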
8461
Temp tmp = bld.tmp(bld.lm);
8462
bld.sop1(Builder::s_wqm, Definition(tmp),
8463
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
8464
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8465
Operand(exec, bld.lm))));
8466
emit_wqm(bld, tmp, dst);
8467
} else if (instr->dest.ssa.bit_size == 8) {
8468
Temp tmp = bld.tmp(v1);
8469
if (ctx->program->chip_class >= GFX8)
8470
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8471
else
8472
emit_wqm(bld,
8473
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8474
tmp);
8475
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
8476
} else if (instr->dest.ssa.bit_size == 16) {
8477
Temp tmp = bld.tmp(v1);
8478
if (ctx->program->chip_class >= GFX8)
8479
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8480
else
8481
emit_wqm(bld,
8482
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8483
tmp);
8484
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
8485
} else if (instr->dest.ssa.bit_size == 32) {
8486
if (ctx->program->chip_class >= GFX8)
8487
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
8488
else
8489
emit_wqm(bld,
8490
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8491
dst);
8492
} else if (instr->dest.ssa.bit_size == 64) {
8493
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8494
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8495
if (ctx->program->chip_class >= GFX8) {
8496
lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
8497
hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
8498
} else {
8499
lo = emit_wqm(
8500
bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
8501
hi = emit_wqm(
8502
bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
8503
}
8504
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8505
emit_split_vector(ctx, dst, 2);
8506
} else {
8507
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8508
}
8509
}
8510
break;
8511
}
8512
case nir_intrinsic_quad_swap_horizontal:
8513
case nir_intrinsic_quad_swap_vertical:
8514
case nir_intrinsic_quad_swap_diagonal:
8515
case nir_intrinsic_quad_swizzle_amd: {
8516
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8517
if (!nir_dest_is_divergent(instr->dest)) {
8518
emit_uniform_subgroup(ctx, instr, src);
8519
break;
8520
}
8521
uint16_t dpp_ctrl = 0;
8522
switch (instr->intrinsic) {
8523
case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8524
case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8525
case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8526
case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8527
default: break;
8528
}
8529
if (ctx->program->chip_class < GFX8)
8530
dpp_ctrl |= (1 << 15);
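/* Pre-GFX8 has no DPP, so the quad permutation is done with ds_swizzle_b32 instead;
 * bit 15 of the swizzle offset selects its quad-permute mode, where the low 8 bits
 * carry the same four 2-bit lane selects as dpp_quad_perm. */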
8531
8532
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8533
8534
if (instr->dest.ssa.bit_size != 1)
8535
src = as_vgpr(ctx, src);
8536
8537
if (instr->dest.ssa.bit_size == 1) {
8538
assert(src.regClass() == bld.lm);
8539
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8540
Operand::c32(-1), src);
8541
if (ctx->program->chip_class >= GFX8)
8542
src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
8543
else
8544
src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
8545
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8546
emit_wqm(bld, tmp, dst);
8547
} else if (instr->dest.ssa.bit_size == 8) {
8548
Temp tmp = bld.tmp(v1);
8549
if (ctx->program->chip_class >= GFX8)
8550
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8551
else
8552
emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
8553
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
8554
} else if (instr->dest.ssa.bit_size == 16) {
8555
Temp tmp = bld.tmp(v1);
8556
if (ctx->program->chip_class >= GFX8)
8557
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8558
else
8559
emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
8560
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
8561
} else if (instr->dest.ssa.bit_size == 32) {
8562
Temp tmp;
8563
if (ctx->program->chip_class >= GFX8)
8564
tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
8565
else
8566
tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
8567
emit_wqm(bld, tmp, dst);
8568
} else if (instr->dest.ssa.bit_size == 64) {
8569
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8570
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8571
if (ctx->program->chip_class >= GFX8) {
8572
lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
8573
hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
8574
} else {
8575
lo = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
8576
hi = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
8577
}
8578
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8579
emit_split_vector(ctx, dst, 2);
8580
} else {
8581
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8582
}
8583
break;
8584
}
8585
case nir_intrinsic_masked_swizzle_amd: {
8586
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8587
if (!nir_dest_is_divergent(instr->dest)) {
8588
emit_uniform_subgroup(ctx, instr, src);
8589
break;
8590
}
8591
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8592
uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8593
8594
if (instr->dest.ssa.bit_size != 1)
8595
src = as_vgpr(ctx, src);
8596
8597
if (instr->dest.ssa.bit_size == 1) {
8598
assert(src.regClass() == bld.lm);
8599
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8600
Operand::c32(-1), src);
8601
src = emit_masked_swizzle(ctx, bld, src, mask);
8602
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8603
emit_wqm(bld, tmp, dst);
8604
} else if (dst.regClass() == v1b) {
8605
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8606
emit_extract_vector(ctx, tmp, 0, dst);
8607
} else if (dst.regClass() == v2b) {
8608
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8609
emit_extract_vector(ctx, tmp, 0, dst);
8610
} else if (dst.regClass() == v1) {
8611
emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8612
} else if (dst.regClass() == v2) {
8613
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8614
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8615
lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8616
hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8617
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8618
emit_split_vector(ctx, dst, 2);
8619
} else {
8620
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8621
}
8622
break;
8623
}
8624
case nir_intrinsic_write_invocation_amd: {
8625
Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8626
Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8627
Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8628
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8629
if (dst.regClass() == v1) {
8630
/* src2 is ignored for writelane. RA assigns the same reg for dst */
8631
emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8632
} else if (dst.regClass() == v2) {
8633
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8634
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8635
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8636
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8637
Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8638
Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8639
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8640
emit_split_vector(ctx, dst, 2);
8641
} else {
8642
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8643
}
8644
break;
8645
}
8646
case nir_intrinsic_mbcnt_amd: {
8647
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8648
Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8649
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8650
/* Fit 64-bit mask for wave32 */
8651
src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8652
Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8653
emit_wqm(bld, wqm_tmp, dst);
8654
break;
8655
}
8656
case nir_intrinsic_byte_permute_amd: {
8657
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8658
assert(dst.regClass() == v1);
8659
assert(ctx->program->chip_class >= GFX8);
8660
bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
8661
as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
8662
as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8663
break;
8664
}
8665
case nir_intrinsic_lane_permute_16_amd: {
8666
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8667
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8668
assert(ctx->program->chip_class >= GFX10);
8669
8670
if (src.regClass() == s1) {
8671
bld.copy(Definition(dst), src);
8672
} else if (dst.regClass() == v1 && src.regClass() == v1) {
8673
bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8674
bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8675
bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8676
} else {
8677
isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8678
}
8679
break;
8680
}
8681
case nir_intrinsic_load_helper_invocation:
8682
case nir_intrinsic_is_helper_invocation: {
8683
/* load_helper() after demote() gets lowered to is_helper().
8684
* Otherwise, these two behave the same. */
8685
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8686
bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
8687
ctx->block->kind |= block_kind_needs_lowering;
8688
ctx->program->needs_exact = true;
8689
break;
8690
}
8691
case nir_intrinsic_demote:
8692
bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));
8693
8694
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8695
ctx->cf_info.exec_potentially_empty_discard = true;
8696
ctx->block->kind |= block_kind_uses_demote;
8697
ctx->program->needs_exact = true;
8698
break;
8699
case nir_intrinsic_demote_if: {
8700
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8701
assert(src.regClass() == bld.lm);
8702
Temp cond =
8703
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8704
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8705
8706
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8707
ctx->cf_info.exec_potentially_empty_discard = true;
8708
ctx->block->kind |= block_kind_uses_demote;
8709
ctx->program->needs_exact = true;
8710
break;
8711
}
8712
case nir_intrinsic_first_invocation: {
8713
emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8714
get_ssa_temp(ctx, &instr->dest.ssa));
8715
break;
8716
}
8717
case nir_intrinsic_last_invocation: {
8718
Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8719
Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8720
Operand::c32(ctx->program->wave_size - 1u), flbit);
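/* s_flbit returns the number of leading zero bits in exec, so
 * wave_size - 1 - flbit is the index of the highest set bit, i.e. the last
 * active invocation. */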
8721
emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
8722
break;
8723
}
8724
case nir_intrinsic_elect: {
8725
Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
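/* s_ff1 gives the lowest set bit of exec; shifting 1 left by that amount below
 * builds a mask with exactly that one lane set, so only the first active
 * invocation reads true. */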
8726
emit_wqm(
8727
bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(1u), first),
8728
get_ssa_temp(ctx, &instr->dest.ssa));
8729
break;
8730
}
8731
case nir_intrinsic_shader_clock: {
8732
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8733
if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
8734
ctx->options->chip_class >= GFX10_3) {
8735
/* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8736
Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
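/* The immediate is hwreg(id, offset, size) encoded as
 * id[5:0] | offset[10:6] | (size-1)[15:11]: id = 29 (SHADER_CYCLES), offset = 0,
 * size = 20, so a 20-bit cycle counter is read and the upper dword of the
 * destination is zeroed by the create_vector below. */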
8737
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8738
} else {
8739
aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
8740
? aco_opcode::s_memrealtime
8741
: aco_opcode::s_memtime;
8742
bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8743
}
8744
emit_split_vector(ctx, dst, 2);
8745
break;
8746
}
8747
case nir_intrinsic_load_vertex_id_zero_base: {
8748
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8749
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8750
break;
8751
}
8752
case nir_intrinsic_load_first_vertex: {
8753
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8754
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8755
break;
8756
}
8757
case nir_intrinsic_load_base_instance: {
8758
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8759
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8760
break;
8761
}
8762
case nir_intrinsic_load_instance_id: {
8763
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8764
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8765
break;
8766
}
8767
case nir_intrinsic_load_draw_id: {
8768
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8769
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8770
break;
8771
}
8772
case nir_intrinsic_load_invocation_id: {
8773
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8774
8775
if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8776
if (ctx->options->chip_class >= GFX10)
8777
bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8778
get_arg(ctx, ctx->args->ac.gs_invocation_id));
8779
else
8780
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8781
} else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8782
bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
8783
Operand::c32(8u), Operand::c32(5u));
8784
} else {
8785
unreachable("Unsupported stage for load_invocation_id");
8786
}
8787
8788
break;
8789
}
8790
case nir_intrinsic_load_primitive_id: {
8791
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8792
8793
switch (ctx->shader->info.stage) {
8794
case MESA_SHADER_GEOMETRY:
8795
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8796
break;
8797
case MESA_SHADER_TESS_CTRL:
8798
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8799
break;
8800
case MESA_SHADER_TESS_EVAL:
8801
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8802
break;
8803
default:
8804
if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
8805
/* In case of NGG, the GS threads always have the primitive ID
8806
* even if there is no SW GS. */
8807
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8808
break;
8809
}
8810
unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8811
}
8812
8813
break;
8814
}
8815
case nir_intrinsic_load_patch_vertices_in: {
8816
assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8817
ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8818
8819
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8820
bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.input_vertices));
8821
break;
8822
}
8823
case nir_intrinsic_emit_vertex_with_counter: {
8824
assert(ctx->stage.hw == HWStage::GS);
8825
visit_emit_vertex_with_counter(ctx, instr);
8826
break;
8827
}
8828
case nir_intrinsic_end_primitive_with_counter: {
8829
if (ctx->stage.hw != HWStage::NGG) {
8830
unsigned stream = nir_intrinsic_stream_id(instr);
8831
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
8832
sendmsg_gs(true, false, stream));
8833
}
8834
break;
8835
}
8836
case nir_intrinsic_set_vertex_and_primitive_count: {
8837
assert(ctx->stage.hw == HWStage::GS);
8838
/* unused in the legacy pipeline, the HW keeps track of this for us */
8839
break;
8840
}
8841
case nir_intrinsic_load_tess_rel_patch_id_amd: {
8842
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
8843
break;
8844
}
8845
case nir_intrinsic_load_ring_tess_factors_amd: {
8846
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8847
ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
8848
break;
8849
}
8850
case nir_intrinsic_load_ring_tess_factors_offset_amd: {
8851
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8852
get_arg(ctx, ctx->args->ac.tcs_factor_offset));
8853
break;
8854
}
8855
case nir_intrinsic_load_ring_tess_offchip_amd: {
8856
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8857
ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
8858
break;
8859
}
8860
case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
8861
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8862
get_arg(ctx, ctx->args->ac.tess_offchip_offset));
8863
break;
8864
}
8865
case nir_intrinsic_load_ring_esgs_amd: {
8866
unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
8867
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8868
ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
8869
break;
8870
}
8871
case nir_intrinsic_load_ring_es2gs_offset_amd: {
8872
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8873
get_arg(ctx, ctx->args->ac.es2gs_offset));
8874
break;
8875
}
8876
case nir_intrinsic_load_gs_vertex_offset_amd: {
8877
unsigned b = nir_intrinsic_base(instr);
8878
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8879
get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
8880
break;
8881
}
8882
case nir_intrinsic_has_input_vertex_amd:
8883
case nir_intrinsic_has_input_primitive_amd: {
8884
assert(ctx->stage.hw == HWStage::NGG);
8885
unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
8886
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
8887
break;
8888
}
8889
case nir_intrinsic_load_workgroup_num_input_vertices_amd:
8890
case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
8891
assert(ctx->stage.hw == HWStage::NGG);
8892
unsigned pos =
8893
instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
8894
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8895
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
8896
Operand::c32(pos | (9u << 16u)));
8897
break;
8898
}
8899
case nir_intrinsic_load_initial_edgeflag_amd: {
8900
assert(ctx->stage.hw == HWStage::NGG);
8901
assert(nir_src_is_const(instr->src[0]));
8902
unsigned i = nir_src_as_uint(instr->src[0]);
8903
8904
Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
8905
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8906
gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u));
8907
break;
8908
}
8909
case nir_intrinsic_load_packed_passthrough_primitive_amd: {
8910
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8911
get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
8912
break;
8913
}
8914
case nir_intrinsic_export_vertex_amd: {
8915
ctx->block->kind |= block_kind_export_end;
8916
create_vs_exports(ctx);
8917
break;
8918
}
8919
case nir_intrinsic_export_primitive_amd: {
8920
assert(ctx->stage.hw == HWStage::NGG);
8921
Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
8922
bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
8923
1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
8924
true /* done */, false /* valid mask */);
8925
break;
8926
}
8927
case nir_intrinsic_alloc_vertices_and_primitives_amd: {
8928
assert(ctx->stage.hw == HWStage::NGG);
8929
Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
8930
Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
8931
ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
8932
break;
8933
}
8934
case nir_intrinsic_gds_atomic_add_amd: {
8935
Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8936
Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8937
Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8938
Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8939
bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8940
true);
8941
break;
8942
}
8943
case nir_intrinsic_load_shader_query_enabled_amd: {
8944
unsigned cmp_bit = 0;
8945
Temp shader_query_enabled =
8946
bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8947
get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
8948
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8949
bool_to_vector_condition(ctx, shader_query_enabled));
8950
break;
8951
}
8952
case nir_intrinsic_load_cull_front_face_enabled_amd:
8953
case nir_intrinsic_load_cull_back_face_enabled_amd:
8954
case nir_intrinsic_load_cull_ccw_amd:
8955
case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
8956
unsigned cmp_bit;
8957
if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
8958
cmp_bit = 0;
8959
else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
8960
cmp_bit = 1;
8961
else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
8962
cmp_bit = 2;
8963
else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
8964
cmp_bit = 3;
8965
else
8966
unreachable("unimplemented culling intrinsic");
8967
8968
Builder::Result enabled =
8969
bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8970
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
8971
enabled.instr->definitions[0].setNoCSE(true);
8972
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8973
bool_to_vector_condition(ctx, enabled));
8974
break;
8975
}
8976
case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
8977
case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8978
case nir_intrinsic_load_cull_any_enabled_amd: {
8979
Builder::Result cull_any_enabled =
8980
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8981
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0x00ffffffu));
8982
cull_any_enabled.instr->definitions[1].setNoCSE(true);
8983
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8984
bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
8985
break;
8986
}
8987
case nir_intrinsic_load_cull_small_prim_precision_amd: {
8988
/* Exponent is 8-bit signed int, move that into a signed 32-bit int. */
8989
Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
8990
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
8991
/* small_prim_precision = 1.0 * 2^X */
8992
bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8993
Operand::c32(0x3f800000u), Operand(exponent));
8994
break;
8995
}
8996
case nir_intrinsic_load_viewport_x_scale: {
8997
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8998
get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
8999
break;
9000
}
9001
case nir_intrinsic_load_viewport_y_scale: {
9002
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9003
get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
9004
break;
9005
}
9006
case nir_intrinsic_load_viewport_x_offset: {
9007
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9008
get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
9009
break;
9010
}
9011
case nir_intrinsic_load_viewport_y_offset: {
9012
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9013
get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
9014
break;
9015
}
9016
case nir_intrinsic_overwrite_vs_arguments_amd: {
9017
ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9018
ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9019
break;
9020
}
9021
case nir_intrinsic_overwrite_tes_arguments_amd: {
9022
ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9023
ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9024
ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9025
get_ssa_temp(ctx, instr->src[2].ssa);
9026
ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9027
break;
9028
}
9029
case nir_intrinsic_overwrite_subgroup_num_vertices_and_primitives_amd: {
9030
Temp old_merged_wave_info = get_arg(ctx, ctx->args->ac.merged_wave_info);
9031
Temp num_vertices = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9032
Temp num_primitives = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9033
Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), num_primitives,
9034
Operand::c32(8u));
9035
tmp = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), tmp, num_vertices);
9036
ctx->arg_temps[ctx->args->ac.merged_wave_info.arg_index] =
9037
bld.sop2(aco_opcode::s_pack_lh_b32_b16, bld.def(s1), tmp, old_merged_wave_info);
9038
break;
9039
}
9040
default:
9041
isel_err(&instr->instr, "Unimplemented intrinsic instr");
9042
abort();
9043
9044
break;
9045
}
9046
}
9047
9048
void
9049
tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
9050
Temp* fmask_ptr, enum glsl_base_type* stype)
9051
{
9052
nir_deref_instr* texture_deref_instr = NULL;
9053
nir_deref_instr* sampler_deref_instr = NULL;
9054
int plane = -1;
9055
9056
for (unsigned i = 0; i < instr->num_srcs; i++) {
9057
switch (instr->src[i].src_type) {
9058
case nir_tex_src_texture_deref:
9059
texture_deref_instr = nir_src_as_deref(instr->src[i].src);
9060
break;
9061
case nir_tex_src_sampler_deref:
9062
sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
9063
break;
9064
case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
9065
default: break;
9066
}
9067
}
9068
9069
*stype = glsl_get_sampler_result_type(texture_deref_instr->type);
9070
9071
if (!sampler_deref_instr)
9072
sampler_deref_instr = texture_deref_instr;
9073
9074
if (plane >= 0) {
9075
assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
9076
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
9077
*res_ptr = get_sampler_desc(ctx, texture_deref_instr,
9078
(aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
9079
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9080
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
9081
} else if (instr->op == nir_texop_fragment_mask_fetch) {
9082
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9083
} else {
9084
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
9085
}
9086
if (samp_ptr) {
9087
*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);
9088
9089
if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
9090
/* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
9091
Builder bld(ctx->program, ctx->block);
9092
9093
/* to avoid unnecessary moves, we split and recombine sampler and image */
9094
Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
9095
bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9096
Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9097
bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
9098
Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
9099
Definition(img[6]), Definition(img[7]), *res_ptr);
9100
bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
9101
Definition(samp[2]), Definition(samp[3]), *samp_ptr);
9102
9103
samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
9104
*res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
9105
img[3], img[4], img[5], img[6], img[7]);
9106
*samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
9107
samp[3]);
9108
}
9109
}
9110
if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical))
9111
*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9112
}
9113
9114
void
9115
build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9116
Temp* out_tc)
9117
{
9118
Builder bld(ctx->program, ctx->block);
9119
9120
Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9121
Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9122
Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9123
9124
Operand neg_one = Operand::c32(0xbf800000u);
9125
Operand one = Operand::c32(0x3f800000u);
9126
Operand two = Operand::c32(0x40000000u);
9127
Operand four = Operand::c32(0x40800000u);
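/* These are the IEEE-754 bit patterns for -1.0, 1.0, 2.0 and 4.0. The cube face id
 * from v_cubeid_f32 is 0/1 for +/-X, 2/3 for +/-Y and 4/5 for +/-Z, so the
 * comparisons against 4.0 and 2.0 below classify which axis is the major axis. */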
9128
9129
Temp is_ma_positive =
9130
bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
9131
Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
9132
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);
9133
9134
Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
9135
Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
9136
is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
9137
Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
9138
bld.def(s1, scc), is_ma_z, is_ma_y);
9139
9140
/* select sc */
9141
Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
9142
Temp sgn = bld.vop2_e64(
9143
aco_opcode::v_cndmask_b32, bld.def(v1),
9144
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
9145
*out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9146
9147
/* select tc */
9148
tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
9149
sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
9150
*out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9151
9152
/* select ma */
9153
tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9154
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
9155
deriv_z, is_ma_z);
9156
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
9157
*out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
9158
}
9159
9160
void
9161
prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9162
bool is_deriv, bool is_array)
9163
{
9164
Builder bld(ctx->program, ctx->block);
9165
Temp ma, tc, sc, id;
9166
aco_opcode madak =
9167
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9168
aco_opcode madmk =
9169
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9170
9171
if (is_array) {
9172
coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
9173
9174
/* see comment in ac_prepare_cube_coords() */
9175
if (ctx->options->chip_class <= GFX8)
9176
coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
9177
}
9178
9179
ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9180
9181
aco_ptr<VOP3_instruction> vop3a{
9182
create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
9183
vop3a->operands[0] = Operand(ma);
9184
vop3a->abs[0] = true;
9185
Temp invma = bld.tmp(v1);
9186
vop3a->definitions[0] = Definition(invma);
9187
ctx->block->instructions.emplace_back(std::move(vop3a));
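/* v_rcp_f32 is built manually as a VOP3 instruction so the abs input modifier can
 * be applied to the operand, i.e. invma = 1.0 / |ma|. */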
9188
9189
sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9190
if (!is_deriv)
9191
sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9192
9193
tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9194
if (!is_deriv)
9195
tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9196
9197
id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9198
9199
if (is_deriv) {
9200
sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
9201
tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
9202
9203
for (unsigned i = 0; i < 2; i++) {
9204
/* see comment in ac_prepare_cube_coords() */
9205
Temp deriv_ma;
9206
Temp deriv_sc, deriv_tc;
9207
build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9208
9209
deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
9210
9211
Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9212
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
9213
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
9214
Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9215
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
9216
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
9217
*(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
9218
}
9219
9220
sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
9221
tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
9222
}
9223
9224
if (is_array)
9225
id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
9226
coords.resize(3);
9227
coords[0] = sc;
9228
coords[1] = tc;
9229
coords[2] = id;
9230
}
9231
9232
void
9233
get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
9234
{
9235
if (vec->parent_instr->type != nir_instr_type_alu)
9236
return;
9237
nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9238
if (vec_instr->op != nir_op_vec(vec->num_components))
9239
return;
9240
9241
for (unsigned i = 0; i < vec->num_components; i++) {
9242
cv[i] =
9243
vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9244
}
9245
}
9246
9247
void
9248
visit_tex(isel_context* ctx, nir_tex_instr* instr)
9249
{
9250
Builder bld(ctx->program, ctx->block);
9251
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9252
has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9253
has_sample_index = false, has_clamped_lod = false;
9254
Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
9255
lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
9256
clamped_lod = Temp();
9257
std::vector<Temp> coords;
9258
std::vector<Temp> derivs;
9259
nir_const_value* sample_index_cv = NULL;
9260
nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9261
enum glsl_base_type stype;
9262
tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
9263
9264
bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
9265
(stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
9266
bool tg4_integer_cube_workaround =
9267
tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9268
9269
for (unsigned i = 0; i < instr->num_srcs; i++) {
9270
switch (instr->src[i].src_type) {
9271
case nir_tex_src_coord: {
9272
Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9273
for (unsigned j = 0; j < coord.size(); j++)
9274
coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
9275
break;
9276
}
9277
case nir_tex_src_bias:
9278
bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9279
has_bias = true;
9280
break;
9281
case nir_tex_src_lod: {
9282
if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9283
level_zero = true;
9284
} else {
9285
lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9286
has_lod = true;
9287
}
9288
break;
9289
}
9290
case nir_tex_src_min_lod:
9291
clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9292
has_clamped_lod = true;
9293
break;
9294
case nir_tex_src_comparator:
9295
if (instr->is_shadow) {
9296
compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9297
has_compare = true;
9298
}
9299
break;
9300
case nir_tex_src_offset:
9301
offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9302
get_const_vec(instr->src[i].src.ssa, const_offset);
9303
has_offset = true;
9304
break;
9305
case nir_tex_src_ddx:
9306
ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
9307
has_ddx = true;
9308
break;
9309
case nir_tex_src_ddy:
9310
ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
9311
has_ddy = true;
9312
break;
9313
case nir_tex_src_ms_index:
9314
sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
9315
sample_index_cv = nir_src_as_const_value(instr->src[i].src);
9316
has_sample_index = true;
9317
break;
9318
case nir_tex_src_texture_offset:
9319
case nir_tex_src_sampler_offset:
9320
default: break;
9321
}
9322
}
9323
9324
if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9325
return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9326
9327
if (instr->op == nir_texop_texture_samples) {
9328
get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9329
return;
9330
}
9331
9332
if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
9333
aco_ptr<Instruction> tmp_instr;
9334
Temp acc, pack = Temp();
9335
9336
uint32_t pack_const = 0;
9337
for (unsigned i = 0; i < offset.size(); i++) {
9338
if (!const_offset[i])
9339
continue;
9340
pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9341
}
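/* For the *_o sample opcodes the offset operand appears to pack each component as
 * a signed 6-bit field in its own byte (x in bits 5:0, y in 13:8, z in 21:16),
 * hence the 0x3F masks and the 8 * i shifts here and below. */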
9342
9343
if (offset.type() == RegType::sgpr) {
9344
for (unsigned i = 0; i < offset.size(); i++) {
9345
if (const_offset[i])
9346
continue;
9347
9348
acc = emit_extract_vector(ctx, offset, i, s1);
9349
acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9350
Operand::c32(0x3Fu));
9351
9352
if (i) {
9353
acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9354
Operand::c32(8u * i));
9355
}
9356
9357
if (pack == Temp()) {
9358
pack = acc;
9359
} else {
9360
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9361
}
9362
}
9363
9364
if (pack_const && pack != Temp())
9365
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9366
Operand::c32(pack_const), pack);
9367
} else {
9368
for (unsigned i = 0; i < offset.size(); i++) {
9369
if (const_offset[i])
9370
continue;
9371
9372
acc = emit_extract_vector(ctx, offset, i, v1);
9373
acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9374
9375
if (i) {
9376
acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9377
}
9378
9379
if (pack == Temp()) {
9380
pack = acc;
9381
} else {
9382
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9383
}
9384
}
9385
9386
if (pack_const && pack != Temp())
9387
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9388
}
9389
if (pack_const && pack == Temp())
9390
offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9391
else if (pack == Temp())
9392
has_offset = false;
9393
else
9394
offset = pack;
9395
}
9396
9397
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
9398
prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9399
instr->is_array && instr->op != nir_texop_lod);
9400
9401
/* pack derivatives */
9402
if (has_ddx || has_ddy) {
9403
if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
9404
assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
9405
Temp zero = bld.copy(bld.def(v1), Operand::zero());
9406
derivs = {ddx, zero, ddy, zero};
9407
} else {
9408
for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
9409
derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
9410
for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
9411
derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
9412
}
9413
has_derivs = true;
9414
}
9415
9416
if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9417
instr->is_array && instr->op != nir_texop_txf)
9418
coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
9419
9420
if (instr->coord_components > 2 &&
9421
(instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9422
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
9423
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9424
instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms &&
9425
instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch)
9426
coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
9427
9428
if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9429
instr->op != nir_texop_lod && instr->coord_components) {
9430
assert(coords.size() > 0 && coords.size() < 3);
9431
9432
coords.insert(std::next(coords.begin()),
9433
bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
9434
: Operand::c32(0x3f000000)));
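/* GFX9 addresses 1D textures as 2D, so a dummy Y coordinate is inserted here:
 * 0 for txf (unnormalized integer coords), 0.5 (0x3f000000) otherwise, which
 * lands in the middle of the single row. */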
9435
}
9436
9437
bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9438
9439
if (instr->op == nir_texop_samples_identical)
9440
resource = fmask_ptr;
9441
9442
else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9443
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9444
instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch &&
9445
instr->op != nir_texop_fragment_mask_fetch) {
9446
assert(has_sample_index);
9447
Operand op(sample_index);
9448
if (sample_index_cv)
9449
op = Operand::c32(sample_index_cv->u32);
9450
sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
9451
}
9452
9453
if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
9454
for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
9455
Temp off = emit_extract_vector(ctx, offset, i, v1);
9456
coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
9457
}
9458
has_offset = false;
9459
}
9460
9461
/* Build tex instruction */
9462
unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
9463
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9464
dmask = u_bit_consecutive(0, util_last_bit(dmask));
9465
if (instr->is_sparse)
9466
dmask = MAX2(dmask, 1) | 0x10;
9467
unsigned dim =
9468
ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9469
? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
9470
: 0;
9471
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9472
Temp tmp_dst = dst;
9473
9474
/* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9475
if (instr->op == nir_texop_tg4) {
9476
assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
9477
if (instr->is_shadow)
9478
dmask = 1;
9479
else
9480
dmask = 1 << instr->component;
9481
if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9482
tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
9483
} else if (instr->op == nir_texop_samples_identical) {
9484
tmp_dst = bld.tmp(v1);
9485
} else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
9486
dst.type() == RegType::sgpr) {
9487
tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
9488
}
9489
9490
if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
9491
if (!has_lod)
9492
lod = bld.copy(bld.def(v1), Operand::zero());
9493
9494
bool div_by_6 = instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
9495
instr->is_array && (dmask & (1 << 2));
9496
if (tmp_dst.id() == dst.id() && div_by_6)
9497
tmp_dst = bld.tmp(tmp_dst.regClass());
9498
9499
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
9500
resource, Operand(s4), std::vector<Temp>{lod});
9501
if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
9502
instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
9503
tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
9504
} else if (instr->op == nir_texop_query_levels) {
9505
tex->dmask = 1 << 3;
9506
} else {
9507
tex->dmask = dmask;
9508
}
9509
tex->da = da;
9510
tex->dim = dim;
9511
9512
if (div_by_6) {
9513
/* divide 3rd value by 6 by multiplying with magic number */
9514
emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9515
Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));
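/* 0x2AAAAAAB = 715827883 = (2^32 + 2) / 6, so for a layer count n that is a
 * multiple of 6, n * 0x2AAAAAAB = (n / 6) * 2^32 + n / 3 and v_mul_hi_i32
 * returns exactly n / 6 (cube arrays report their size in layers, 6 per cube). */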
9516
Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1),
9517
emit_extract_vector(ctx, tmp_dst, 2, v1), c);
9518
assert(instr->dest.ssa.num_components == 3);
9519
Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
9520
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
9521
emit_extract_vector(ctx, tmp_dst, 0, v1),
9522
emit_extract_vector(ctx, tmp_dst, 1, v1), by_6);
9523
}
9524
9525
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9526
return;
9527
}
9528
9529
Temp tg4_compare_cube_wa64 = Temp();
9530
9531
if (tg4_integer_workarounds) {
9532
Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9533
Temp size = bld.tmp(v2);
9534
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
9535
resource, Operand(s4), std::vector<Temp>{tg4_lod});
9536
tex->dim = dim;
9537
tex->dmask = 0x3;
9538
tex->da = da;
9539
emit_split_vector(ctx, size, size.size());
9540
9541
Temp half_texel[2];
9542
for (unsigned i = 0; i < 2; i++) {
9543
half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9544
half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9545
half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9546
half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9547
Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9548
}
9549
9550
if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9551
/* In vulkan, whether the sampler uses unnormalized
9552
* coordinates or not is a dynamic property of the
9553
* sampler. Hence, to figure out whether or not we
9554
* need to divide by the texture size, we need to test
9555
* the sampler at runtime. This tests the bit set by
9556
* radv_init_sampler().
9557
*/
9558
unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9559
Temp not_needed =
9560
bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9561
9562
not_needed = bool_to_vector_condition(ctx, not_needed);
9563
half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9564
Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9565
half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9566
Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9567
}
9568
9569
Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9570
bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9571
9572
if (tg4_integer_cube_workaround) {
9573
/* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9574
Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9575
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9576
aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9577
split->operands[0] = Operand(resource);
9578
for (unsigned i = 0; i < resource.size(); i++) {
9579
desc[i] = bld.tmp(s1);
9580
split->definitions[i] = Definition(desc[i]);
9581
}
9582
ctx->block->instructions.emplace_back(std::move(split));
9583
9584
Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9585
Operand::c32(20u | (6u << 16)));
9586
Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9587
Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9588
9589
Temp nfmt;
9590
if (stype == GLSL_TYPE_UINT) {
9591
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9592
Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9593
Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9594
} else {
9595
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9596
Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9597
Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9598
}
9599
tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9600
bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9601
9602
nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9603
Operand::c32(26u));
9604
9605
desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9606
Operand::c32(C_008F14_NUM_FORMAT));
9607
desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9608
9609
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9610
aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9611
for (unsigned i = 0; i < resource.size(); i++)
9612
vec->operands[i] = Operand(desc[i]);
9613
resource = bld.tmp(resource.regClass());
9614
vec->definitions[0] = Definition(resource);
9615
ctx->block->instructions.emplace_back(std::move(vec));
9616
9617
new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9618
tg4_compare_cube_wa64);
9619
new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9620
tg4_compare_cube_wa64);
9621
}
9622
coords[0] = new_coords[0];
9623
coords[1] = new_coords[1];
9624
}
9625
9626
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9627
// FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9628
// ac_build_buffer_load_format_gfx9_safe()
9629
9630
assert(coords.size() == 1);
9631
aco_opcode op;
9632
switch (util_last_bit(dmask & 0xf)) {
9633
case 1: op = aco_opcode::buffer_load_format_x; break;
9634
case 2: op = aco_opcode::buffer_load_format_xy; break;
9635
case 3: op = aco_opcode::buffer_load_format_xyz; break;
9636
case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9637
default: unreachable("Tex instruction loads more than 4 components.");
9638
}
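/* Note: the opcode is chosen from the highest requested component, so a sparse dmask such as
 * 0x5 (x and z) still selects buffer_load_format_xyz (components below the highest one are
 * loaded even if unused). */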
9639
9640
aco_ptr<MUBUF_instruction> mubuf{
9641
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9642
mubuf->operands[0] = Operand(resource);
9643
mubuf->operands[1] = Operand(coords[0]);
9644
mubuf->operands[2] = Operand::c32(0);
9645
mubuf->definitions[0] = Definition(tmp_dst);
9646
mubuf->idxen = true;
9647
mubuf->tfe = instr->is_sparse;
9648
if (mubuf->tfe)
9649
mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9650
ctx->block->instructions.emplace_back(std::move(mubuf));
9651
9652
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9653
return;
9654
}
9655
9656
/* gather MIMG address components */
9657
std::vector<Temp> args;
9658
unsigned wqm_mask = 0;
9659
if (has_offset) {
9660
wqm_mask |= u_bit_consecutive(args.size(), 1);
9661
args.emplace_back(offset);
9662
}
9663
if (has_bias)
9664
args.emplace_back(bias);
9665
if (has_compare)
9666
args.emplace_back(compare);
9667
if (has_derivs)
9668
args.insert(args.end(), derivs.begin(), derivs.end());
9669
9670
wqm_mask |= u_bit_consecutive(args.size(), coords.size());
9671
args.insert(args.end(), coords.begin(), coords.end());
9672
9673
if (has_sample_index)
9674
args.emplace_back(sample_index);
9675
if (has_lod)
9676
args.emplace_back(lod);
9677
if (has_clamped_lod)
9678
args.emplace_back(clamped_lod);
9679
9680
if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms ||
9681
instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch ||
9682
instr->op == nir_texop_fragment_mask_fetch) {
9683
aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9684
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9685
? aco_opcode::image_load
9686
: aco_opcode::image_load_mip;
9687
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9688
MIMG_instruction* tex =
9689
emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
9690
tex->dim = dim;
9691
tex->dmask = dmask & 0xf;
9692
tex->unrm = true;
9693
tex->da = da;
9694
tex->tfe = instr->is_sparse;
9695
9696
if (instr->op == nir_texop_samples_identical) {
9697
assert(dmask == 1 && dst.regClass() == bld.lm);
9698
assert(dst.id() != tmp_dst.id());
9699
9700
bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), tmp_dst)
9701
.def(0)
9702
.setHint(vcc);
9703
} else {
9704
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9705
}
9706
return;
9707
}
9708
9709
// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
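/* The chains below rely on later assignments overriding earlier ones, so when several modifiers
 * are present the later checks take precedence (e.g. an explicit lod wins over bias or
 * derivatives). */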
9710
aco_opcode opcode = aco_opcode::image_sample;
9711
if (has_offset) { /* image_sample_*_o */
9712
if (has_clamped_lod) {
9713
if (has_compare) {
9714
opcode = aco_opcode::image_sample_c_cl_o;
9715
if (has_derivs)
9716
opcode = aco_opcode::image_sample_c_d_cl_o;
9717
if (has_bias)
9718
opcode = aco_opcode::image_sample_c_b_cl_o;
9719
} else {
9720
opcode = aco_opcode::image_sample_cl_o;
9721
if (has_derivs)
9722
opcode = aco_opcode::image_sample_d_cl_o;
9723
if (has_bias)
9724
opcode = aco_opcode::image_sample_b_cl_o;
9725
}
9726
} else if (has_compare) {
9727
opcode = aco_opcode::image_sample_c_o;
9728
if (has_derivs)
9729
opcode = aco_opcode::image_sample_c_d_o;
9730
if (has_bias)
9731
opcode = aco_opcode::image_sample_c_b_o;
9732
if (level_zero)
9733
opcode = aco_opcode::image_sample_c_lz_o;
9734
if (has_lod)
9735
opcode = aco_opcode::image_sample_c_l_o;
9736
} else {
9737
opcode = aco_opcode::image_sample_o;
9738
if (has_derivs)
9739
opcode = aco_opcode::image_sample_d_o;
9740
if (has_bias)
9741
opcode = aco_opcode::image_sample_b_o;
9742
if (level_zero)
9743
opcode = aco_opcode::image_sample_lz_o;
9744
if (has_lod)
9745
opcode = aco_opcode::image_sample_l_o;
9746
}
9747
} else if (has_clamped_lod) { /* image_sample_*_cl */
9748
if (has_compare) {
9749
opcode = aco_opcode::image_sample_c_cl;
9750
if (has_derivs)
9751
opcode = aco_opcode::image_sample_c_d_cl;
9752
if (has_bias)
9753
opcode = aco_opcode::image_sample_c_b_cl;
9754
} else {
9755
opcode = aco_opcode::image_sample_cl;
9756
if (has_derivs)
9757
opcode = aco_opcode::image_sample_d_cl;
9758
if (has_bias)
9759
opcode = aco_opcode::image_sample_b_cl;
9760
}
9761
} else { /* no offset */
9762
if (has_compare) {
9763
opcode = aco_opcode::image_sample_c;
9764
if (has_derivs)
9765
opcode = aco_opcode::image_sample_c_d;
9766
if (has_bias)
9767
opcode = aco_opcode::image_sample_c_b;
9768
if (level_zero)
9769
opcode = aco_opcode::image_sample_c_lz;
9770
if (has_lod)
9771
opcode = aco_opcode::image_sample_c_l;
9772
} else {
9773
opcode = aco_opcode::image_sample;
9774
if (has_derivs)
9775
opcode = aco_opcode::image_sample_d;
9776
if (has_bias)
9777
opcode = aco_opcode::image_sample_b;
9778
if (level_zero)
9779
opcode = aco_opcode::image_sample_lz;
9780
if (has_lod)
9781
opcode = aco_opcode::image_sample_l;
9782
}
9783
}
9784
9785
if (instr->op == nir_texop_tg4) {
9786
if (has_offset) { /* image_gather4_*_o */
9787
if (has_compare) {
9788
opcode = aco_opcode::image_gather4_c_lz_o;
9789
if (has_lod)
9790
opcode = aco_opcode::image_gather4_c_l_o;
9791
if (has_bias)
9792
opcode = aco_opcode::image_gather4_c_b_o;
9793
} else {
9794
opcode = aco_opcode::image_gather4_lz_o;
9795
if (has_lod)
9796
opcode = aco_opcode::image_gather4_l_o;
9797
if (has_bias)
9798
opcode = aco_opcode::image_gather4_b_o;
9799
}
9800
} else {
9801
if (has_compare) {
9802
opcode = aco_opcode::image_gather4_c_lz;
9803
if (has_lod)
9804
opcode = aco_opcode::image_gather4_c_l;
9805
if (has_bias)
9806
opcode = aco_opcode::image_gather4_c_b;
9807
} else {
9808
opcode = aco_opcode::image_gather4_lz;
9809
if (has_lod)
9810
opcode = aco_opcode::image_gather4_l;
9811
if (has_bias)
9812
opcode = aco_opcode::image_gather4_b;
9813
}
9814
}
9815
} else if (instr->op == nir_texop_lod) {
9816
opcode = aco_opcode::image_get_lod;
9817
}
9818
9819
bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9820
!level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9821
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9822
9823
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9824
MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
9825
args, implicit_derivs ? wqm_mask : 0, vdata);
9826
tex->dim = dim;
9827
tex->dmask = dmask & 0xf;
9828
tex->da = da;
9829
tex->tfe = instr->is_sparse;
9830
9831
if (tg4_integer_cube_workaround) {
9832
assert(tmp_dst.id() != dst.id());
9833
assert(tmp_dst.size() == dst.size());
9834
9835
emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9836
Temp val[4];
9837
for (unsigned i = 0; i < 4; i++) {
9838
val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9839
Temp cvt_val;
9840
if (stype == GLSL_TYPE_UINT)
9841
cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9842
else
9843
cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9844
val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9845
tg4_compare_cube_wa64);
9846
}
9847
9848
Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9849
if (instr->is_sparse)
9850
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9851
val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9852
else
9853
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9854
val[3]);
9855
}
9856
unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9857
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9858
}
9859
9860
Operand
9861
get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
9862
{
9863
Temp tmp = get_ssa_temp(ctx, ssa);
9864
if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9865
return Operand(rc);
9866
} else if (logical && ssa->bit_size == 1 &&
9867
ssa->parent_instr->type == nir_instr_type_load_const) {
9868
if (ctx->program->wave_size == 64)
9869
return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
9870
: 0u);
9871
else
9872
return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
9873
: 0u);
9874
} else {
9875
return Operand(tmp);
9876
}
9877
}
9878
9879
void
9880
visit_phi(isel_context* ctx, nir_phi_instr* instr)
9881
{
9882
aco_ptr<Pseudo_instruction> phi;
9883
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9884
assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9885
9886
bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9887
logical |= (ctx->block->kind & block_kind_merge) != 0;
9888
aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9889
9890
/* we want a sorted list of sources, since the predecessor list is also sorted */
9891
std::map<unsigned, nir_ssa_def*> phi_src;
9892
nir_foreach_phi_src (src, instr)
9893
phi_src[src->pred->index] = src->src.ssa;
9894
9895
std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9896
unsigned num_operands = 0;
9897
Operand* const operands = (Operand*)alloca(
9898
(std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
9899
unsigned num_defined = 0;
9900
unsigned cur_pred_idx = 0;
9901
for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
9902
if (cur_pred_idx < preds.size()) {
9903
/* handle missing preds (IF merges with discard/break) and extra preds
9904
* (loop exit with discard); a standalone sketch of this alignment follows visit_phi() below */
9905
unsigned block = ctx->cf_info.nir_to_aco[src.first];
9906
unsigned skipped = 0;
9907
while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9908
skipped++;
9909
if (cur_pred_idx + skipped < preds.size()) {
9910
for (unsigned i = 0; i < skipped; i++)
9911
operands[num_operands++] = Operand(dst.regClass());
9912
cur_pred_idx += skipped;
9913
} else {
9914
continue;
9915
}
9916
}
9917
/* Handle missing predecessors at the end. This shouldn't happen with loop
9918
* headers, and for loop header phis we can't ignore these sources anyway. */
9919
if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9920
continue;
9921
cur_pred_idx++;
9922
Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9923
operands[num_operands++] = op;
9924
num_defined += !op.isUndefined();
9925
}
9926
/* handle block_kind_continue_or_break at loop exit blocks */
9927
while (cur_pred_idx++ < preds.size())
9928
operands[num_operands++] = Operand(dst.regClass());
9929
9930
/* If the loop ends with a break, still add a linear continue edge in case
9931
* that break is divergent or continue_or_break is used. We'll either remove
9932
* this operand later in visit_loop() if it's not necessary or replace the
9933
* undef with something correct. */
9934
if (!logical && ctx->block->kind & block_kind_loop_header) {
9935
nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9936
nir_block* last = nir_loop_last_block(loop);
9937
if (last->successors[0] != instr->instr.block)
9938
operands[num_operands++] = Operand(RegClass());
9939
}
9940
9941
/* we can use a linear phi in some cases if one src is undef */
9942
if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9943
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9944
num_operands, 1));
9945
9946
Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9947
Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9948
assert(invert->kind & block_kind_invert);
9949
9950
unsigned then_block = invert->linear_preds[0];
9951
9952
Block* insert_block = NULL;
9953
for (unsigned i = 0; i < num_operands; i++) {
9954
Operand op = operands[i];
9955
if (op.isUndefined())
9956
continue;
9957
insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9958
phi->operands[0] = op;
9959
break;
9960
}
9961
assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9962
phi->operands[1] = Operand(dst.regClass());
9963
phi->definitions[0] = Definition(dst);
9964
insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9965
return;
9966
}
9967
9968
phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9969
for (unsigned i = 0; i < num_operands; i++)
9970
phi->operands[i] = operands[i];
9971
phi->definitions[0] = Definition(dst);
9972
ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9973
}
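/* Standalone sketch (disabled) of the predecessor/source alignment performed in visit_phi()
 * above. It is a hypothetical helper, simplified: the nir_to_aco block mapping and the
 * loop-header special cases are ignored, plain ints stand in for Operands, and it relies on
 * <map> and <vector>, which are already included at the top of this file. An "undef" operand
 * is emitted for every ACO predecessor without a matching NIR source, and sources whose block
 * is no longer a predecessor are dropped. */
#if 0
static std::vector<int>
sketch_align_phi_sources(const std::vector<unsigned>& preds, const std::map<unsigned, int>& srcs)
{
   const int undef = -1; /* stands in for Operand(dst.regClass()) */
   std::vector<int> operands;
   unsigned cur = 0;
   for (const std::pair<const unsigned, int>& src : srcs) {
      /* skip predecessors that have no source, emitting undefs for them */
      unsigned skipped = 0;
      while (cur + skipped < preds.size() && preds[cur + skipped] != src.first)
         skipped++;
      if (cur + skipped == preds.size())
         continue; /* this source's block is not a predecessor anymore */
      for (unsigned i = 0; i < skipped; i++)
         operands.push_back(undef);
      cur += skipped + 1;
      operands.push_back(src.second);
   }
   /* trailing predecessors without a source (e.g. continue_or_break exits) */
   while (cur++ < preds.size())
      operands.push_back(undef);
   return operands;
}
#endif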
9974
9975
void
9976
visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
9977
{
9978
Temp dst = get_ssa_temp(ctx, &instr->def);
9979
9980
assert(dst.type() == RegType::sgpr);
9981
9982
if (dst.size() == 1) {
9983
Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9984
} else {
9985
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9986
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9987
for (unsigned i = 0; i < dst.size(); i++)
9988
vec->operands[i] = Operand::zero();
9989
vec->definitions[0] = Definition(dst);
9990
ctx->block->instructions.emplace_back(std::move(vec));
9991
}
9992
}
9993
9994
void
9995
begin_loop(isel_context* ctx, loop_context* lc)
9996
{
9997
// TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9998
append_logical_end(ctx->block);
9999
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10000
Builder bld(ctx->program, ctx->block);
10001
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10002
unsigned loop_preheader_idx = ctx->block->index;
10003
10004
lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10005
10006
ctx->program->next_loop_depth++;
10007
10008
Block* loop_header = ctx->program->create_and_insert_block();
10009
loop_header->kind |= block_kind_loop_header;
10010
add_edge(loop_preheader_idx, loop_header);
10011
ctx->block = loop_header;
10012
10013
append_logical_start(ctx->block);
10014
10015
lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10016
lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10017
lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10018
lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10019
lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10020
}
10021
10022
void
10023
end_loop(isel_context* ctx, loop_context* lc)
10024
{
10025
// TODO: what if a loop ends with an unconditional or uniformly branched continue
10026
// and this branch is never taken?
10027
if (!ctx->cf_info.has_branch) {
10028
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10029
Builder bld(ctx->program, ctx->block);
10030
append_logical_end(ctx->block);
10031
10032
if (ctx->cf_info.exec_potentially_empty_discard ||
10033
ctx->cf_info.exec_potentially_empty_break) {
10034
/* Discards can result in code running with an empty exec mask.
10035
* This would result in divergent breaks not ever being taken. As a
10036
* workaround, break the loop when the loop mask is empty instead of
10037
* always continuing. */
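/* The resulting linear CFG looks like this (the two helper blocks below exist
 * only to avoid critical edges):
 *
 *        loop body (continue_or_break)
 *            /                \
 *      break_block       continue_block
 *           |                  |
 *       loop_exit         loop_header
 */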
10038
ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10039
unsigned block_idx = ctx->block->index;
10040
10041
/* create helper blocks to avoid critical edges */
10042
Block* break_block = ctx->program->create_and_insert_block();
10043
break_block->kind = block_kind_uniform;
10044
bld.reset(break_block);
10045
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10046
add_linear_edge(block_idx, break_block);
10047
add_linear_edge(break_block->index, &lc->loop_exit);
10048
10049
Block* continue_block = ctx->program->create_and_insert_block();
10050
continue_block->kind = block_kind_uniform;
10051
bld.reset(continue_block);
10052
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10053
add_linear_edge(block_idx, continue_block);
10054
add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10055
10056
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10057
add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10058
ctx->block = &ctx->program->blocks[block_idx];
10059
} else {
10060
ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10061
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10062
add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10063
else
10064
add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10065
}
10066
10067
bld.reset(ctx->block);
10068
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10069
}
10070
10071
ctx->cf_info.has_branch = false;
10072
ctx->program->next_loop_depth--;
10073
10074
// TODO: if the loop does not have a single exit, we must add one
10075
/* emit loop successor block */
10076
ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10077
append_logical_start(ctx->block);
10078
10079
#if 0
10080
// TODO: check if it is beneficial to not branch on continues
10081
/* trim linear phis in loop header */
10082
for (auto&& instr : loop_entry->instructions) {
10083
if (instr->opcode == aco_opcode::p_linear_phi) {
10084
aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10085
new_phi->definitions[0] = instr->definitions[0];
10086
for (unsigned i = 0; i < new_phi->operands.size(); i++)
10087
new_phi->operands[i] = instr->operands[i];
10088
/* check that the remaining operands are all the same */
10089
for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10090
assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10091
instr.swap(new_phi);
10092
} else if (instr->opcode == aco_opcode::p_phi) {
10093
continue;
10094
} else {
10095
break;
10096
}
10097
}
10098
#endif
10099
10100
ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10101
ctx->cf_info.parent_loop.exit = lc->exit_old;
10102
ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10103
ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10104
ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10105
if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10106
ctx->cf_info.exec_potentially_empty_discard = false;
10107
}
10108
10109
void
10110
emit_loop_jump(isel_context* ctx, bool is_break)
10111
{
10112
Builder bld(ctx->program, ctx->block);
10113
Block* logical_target;
10114
append_logical_end(ctx->block);
10115
unsigned idx = ctx->block->index;
10116
10117
if (is_break) {
10118
logical_target = ctx->cf_info.parent_loop.exit;
10119
add_logical_edge(idx, logical_target);
10120
ctx->block->kind |= block_kind_break;
10121
10122
if (!ctx->cf_info.parent_if.is_divergent &&
10123
!ctx->cf_info.parent_loop.has_divergent_continue) {
10124
/* uniform break - directly jump out of the loop */
10125
ctx->block->kind |= block_kind_uniform;
10126
ctx->cf_info.has_branch = true;
10127
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10128
add_linear_edge(idx, logical_target);
10129
return;
10130
}
10131
ctx->cf_info.parent_loop.has_divergent_branch = true;
10132
} else {
10133
logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10134
add_logical_edge(idx, logical_target);
10135
ctx->block->kind |= block_kind_continue;
10136
10137
if (!ctx->cf_info.parent_if.is_divergent) {
10138
/* uniform continue - directly jump to the loop header */
10139
ctx->block->kind |= block_kind_uniform;
10140
ctx->cf_info.has_branch = true;
10141
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10142
add_linear_edge(idx, logical_target);
10143
return;
10144
}
10145
10146
/* for potential uniform breaks after this continue,
10147
we must ensure that they are handled correctly */
10148
ctx->cf_info.parent_loop.has_divergent_continue = true;
10149
ctx->cf_info.parent_loop.has_divergent_branch = true;
10150
}
10151
10152
if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10153
ctx->cf_info.exec_potentially_empty_break = true;
10154
ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10155
}
10156
10157
/* remove critical edges from linear CFG */
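/* The jump block gets two linear successors so that neither edge is critical: break_block
 * branches on to the jump target, while continue_block falls through to the code after the
 * jump (only reached by lanes that did not take it). */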
10158
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10159
Block* break_block = ctx->program->create_and_insert_block();
10160
break_block->kind |= block_kind_uniform;
10161
add_linear_edge(idx, break_block);
10162
/* the loop_header pointer might be invalidated by this point */
10163
if (!is_break)
10164
logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10165
add_linear_edge(break_block->index, logical_target);
10166
bld.reset(break_block);
10167
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10168
10169
Block* continue_block = ctx->program->create_and_insert_block();
10170
add_linear_edge(idx, continue_block);
10171
append_logical_start(continue_block);
10172
ctx->block = continue_block;
10173
}
10174
10175
void
10176
emit_loop_break(isel_context* ctx)
10177
{
10178
emit_loop_jump(ctx, true);
10179
}
10180
10181
void
10182
emit_loop_continue(isel_context* ctx)
10183
{
10184
emit_loop_jump(ctx, false);
10185
}
10186
10187
void
10188
visit_jump(isel_context* ctx, nir_jump_instr* instr)
10189
{
10190
/* visit_block() would usually do this but divergent jumps update ctx->block */
10191
ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10192
10193
switch (instr->type) {
10194
case nir_jump_break: emit_loop_break(ctx); break;
10195
case nir_jump_continue: emit_loop_continue(ctx); break;
10196
default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10197
}
10198
}
10199
10200
void
10201
visit_block(isel_context* ctx, nir_block* block)
10202
{
10203
nir_foreach_instr (instr, block) {
10204
switch (instr->type) {
10205
case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10206
case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10207
case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10208
case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10209
case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10210
case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10211
case nir_instr_type_deref: break;
10212
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10213
default: isel_err(instr, "Unknown NIR instr type");
10214
}
10215
}
10216
10217
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10218
ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10219
}
10220
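/* Computes the value a loop-header linear phi should receive along the final (fallthrough)
 * continue edge: each earlier continue block's contribution is propagated forward through the
 * linear CFG, and new linear phis are inserted wherever predecessors disagree. Returns the
 * value reaching block `last`. */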
10221
static Operand
10222
create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10223
aco_ptr<Instruction>& header_phi, Operand* vals)
10224
{
10225
vals[0] = Operand(header_phi->definitions[0].getTemp());
10226
RegClass rc = vals[0].regClass();
10227
10228
unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10229
10230
unsigned next_pred = 1;
10231
10232
for (unsigned idx = first + 1; idx <= last; idx++) {
10233
Block& block = ctx->program->blocks[idx];
10234
if (block.loop_nest_depth != loop_nest_depth) {
10235
vals[idx - first] = vals[idx - 1 - first];
10236
continue;
10237
}
10238
10239
if ((block.kind & block_kind_continue) && block.index != last) {
10240
vals[idx - first] = header_phi->operands[next_pred];
10241
next_pred++;
10242
continue;
10243
}
10244
10245
bool all_same = true;
10246
for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10247
all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10248
10249
Operand val;
10250
if (all_same) {
10251
val = vals[block.linear_preds[0] - first];
10252
} else {
10253
aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10254
aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10255
for (unsigned i = 0; i < block.linear_preds.size(); i++)
10256
phi->operands[i] = vals[block.linear_preds[i] - first];
10257
val = Operand(ctx->program->allocateTmp(rc));
10258
phi->definitions[0] = Definition(val.getTemp());
10259
block.instructions.emplace(block.instructions.begin(), std::move(phi));
10260
}
10261
vals[idx - first] = val;
10262
}
10263
10264
return vals[last - first];
10265
}
10266
10267
static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10268
static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10269
static void end_uniform_if(isel_context* ctx, if_context* ic);
10270
10271
static void
10272
visit_loop(isel_context* ctx, nir_loop* loop)
10273
{
10274
loop_context lc;
10275
begin_loop(ctx, &lc);
10276
10277
/* NIR seems to allow loops whose exit block has no predecessors (the loop never breaks), yet SSA defs from the
10278
* loop header are still live after the loop. Handle this without complicating the ACO IR by creating a dummy break
10279
* whose condition is constant false, so it is never taken at runtime but still gives the loop exit a predecessor. */
10280
if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10281
Builder bld(ctx->program, ctx->block);
10282
Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10283
if_context ic;
10284
begin_uniform_if_then(ctx, &ic, cond);
10285
emit_loop_break(ctx);
10286
begin_uniform_if_else(ctx, &ic);
10287
end_uniform_if(ctx, &ic);
10288
}
10289
10290
bool unreachable = visit_cf_list(ctx, &loop->body);
10291
10292
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10293
10294
/* Fixup phis in loop header from unreachable blocks.
10295
* has_branch/has_divergent_branch also indicates if the loop ends with a
10296
* break/continue instruction, but we don't emit those if unreachable=true */
10297
if (unreachable) {
10298
assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10299
bool linear = ctx->cf_info.has_branch;
10300
bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10301
for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10302
if ((logical && instr->opcode == aco_opcode::p_phi) ||
10303
(linear && instr->opcode == aco_opcode::p_linear_phi)) {
10304
/* the last operand should be the one that needs to be removed */
10305
instr->operands.pop_back();
10306
} else if (!is_phi(instr)) {
10307
break;
10308
}
10309
}
10310
}
10311
10312
/* Fix up linear phis in the loop header that expect a continue. This fixup
10313
* and the previous one shouldn't both happen at once, because a break in the
10314
* merge block would get CSE'd */
10315
if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10316
unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10317
Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10318
for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10319
if (instr->opcode == aco_opcode::p_linear_phi) {
10320
if (ctx->cf_info.has_branch)
10321
instr->operands.pop_back();
10322
else
10323
instr->operands.back() =
10324
create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10325
} else if (!is_phi(instr)) {
10326
break;
10327
}
10328
}
10329
}
10330
10331
end_loop(ctx, &lc);
10332
}
10333
10334
static void
10335
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10336
{
10337
ic->cond = cond;
10338
10339
append_logical_end(ctx->block);
10340
ctx->block->kind |= block_kind_branch;
10341
10342
/* branch to linear then block */
10343
assert(cond.regClass() == ctx->program->lane_mask);
10344
aco_ptr<Pseudo_branch_instruction> branch;
10345
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10346
Format::PSEUDO_BRANCH, 1, 1));
10347
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10348
branch->definitions[0].setHint(vcc);
10349
branch->operands[0] = Operand(cond);
10350
ctx->block->instructions.push_back(std::move(branch));
10351
10352
ic->BB_if_idx = ctx->block->index;
10353
ic->BB_invert = Block();
10354
/* Invert blocks are intentionally not marked as top level because they
10355
* are not part of the logical cfg. */
10356
ic->BB_invert.kind |= block_kind_invert;
10357
ic->BB_endif = Block();
10358
ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10359
10360
ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10361
ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10362
ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10363
ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10364
ctx->cf_info.parent_if.is_divergent = true;
10365
10366
/* divergent branches use cbranch_execz */
10367
ctx->cf_info.exec_potentially_empty_discard = false;
10368
ctx->cf_info.exec_potentially_empty_break = false;
10369
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10370
10371
/** emit logical then block */
10372
ctx->program->next_divergent_if_logical_depth++;
10373
Block* BB_then_logical = ctx->program->create_and_insert_block();
10374
add_edge(ic->BB_if_idx, BB_then_logical);
10375
ctx->block = BB_then_logical;
10376
append_logical_start(BB_then_logical);
10377
}
10378
10379
static void
10380
begin_divergent_if_else(isel_context* ctx, if_context* ic)
10381
{
10382
Block* BB_then_logical = ctx->block;
10383
append_logical_end(BB_then_logical);
10384
/* branch from logical then block to invert block */
10385
aco_ptr<Pseudo_branch_instruction> branch;
10386
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10387
Format::PSEUDO_BRANCH, 0, 1));
10388
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10389
branch->definitions[0].setHint(vcc);
10390
BB_then_logical->instructions.emplace_back(std::move(branch));
10391
add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10392
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10393
add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10394
BB_then_logical->kind |= block_kind_uniform;
10395
assert(!ctx->cf_info.has_branch);
10396
ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10397
ctx->cf_info.parent_loop.has_divergent_branch = false;
10398
ctx->program->next_divergent_if_logical_depth--;
10399
10400
/** emit linear then block */
10401
Block* BB_then_linear = ctx->program->create_and_insert_block();
10402
BB_then_linear->kind |= block_kind_uniform;
10403
add_linear_edge(ic->BB_if_idx, BB_then_linear);
10404
/* branch from linear then block to invert block */
10405
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10406
Format::PSEUDO_BRANCH, 0, 1));
10407
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10408
branch->definitions[0].setHint(vcc);
10409
BB_then_linear->instructions.emplace_back(std::move(branch));
10410
add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10411
10412
/** emit invert merge block */
10413
ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10414
ic->invert_idx = ctx->block->index;
10415
10416
/* branch to linear else block (skip else) */
10417
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10418
Format::PSEUDO_BRANCH, 0, 1));
10419
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10420
branch->definitions[0].setHint(vcc);
10421
ctx->block->instructions.push_back(std::move(branch));
10422
10423
ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10424
ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10425
ic->exec_potentially_empty_break_depth_old = std::min(
10426
ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10427
/* divergent branches use cbranch_execz */
10428
ctx->cf_info.exec_potentially_empty_discard = false;
10429
ctx->cf_info.exec_potentially_empty_break = false;
10430
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10431
10432
/** emit logical else block */
10433
ctx->program->next_divergent_if_logical_depth++;
10434
Block* BB_else_logical = ctx->program->create_and_insert_block();
10435
add_logical_edge(ic->BB_if_idx, BB_else_logical);
10436
add_linear_edge(ic->invert_idx, BB_else_logical);
10437
ctx->block = BB_else_logical;
10438
append_logical_start(BB_else_logical);
10439
}
10440
10441
static void
10442
end_divergent_if(isel_context* ctx, if_context* ic)
10443
{
10444
Block* BB_else_logical = ctx->block;
10445
append_logical_end(BB_else_logical);
10446
10447
/* branch from logical else block to endif block */
10448
aco_ptr<Pseudo_branch_instruction> branch;
10449
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10450
Format::PSEUDO_BRANCH, 0, 1));
10451
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10452
branch->definitions[0].setHint(vcc);
10453
BB_else_logical->instructions.emplace_back(std::move(branch));
10454
add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10455
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10456
add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10457
BB_else_logical->kind |= block_kind_uniform;
10458
ctx->program->next_divergent_if_logical_depth--;
10459
10460
assert(!ctx->cf_info.has_branch);
10461
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10462
10463
/** emit linear else block */
10464
Block* BB_else_linear = ctx->program->create_and_insert_block();
10465
BB_else_linear->kind |= block_kind_uniform;
10466
add_linear_edge(ic->invert_idx, BB_else_linear);
10467
10468
/* branch from linear else block to endif block */
10469
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10470
Format::PSEUDO_BRANCH, 0, 1));
10471
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10472
branch->definitions[0].setHint(vcc);
10473
BB_else_linear->instructions.emplace_back(std::move(branch));
10474
add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10475
10476
/** emit endif merge block */
10477
ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10478
append_logical_start(ctx->block);
10479
10480
ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10481
ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10482
ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10483
ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10484
ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10485
if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10486
!ctx->cf_info.parent_if.is_divergent) {
10487
ctx->cf_info.exec_potentially_empty_break = false;
10488
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10489
}
10490
/* uniform control flow never has an empty exec-mask */
10491
if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10492
ctx->cf_info.exec_potentially_empty_discard = false;
10493
ctx->cf_info.exec_potentially_empty_break = false;
10494
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10495
}
10496
}
10497
10498
static void
10499
begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10500
{
10501
assert(cond.regClass() == s1);
10502
10503
append_logical_end(ctx->block);
10504
ctx->block->kind |= block_kind_uniform;
10505
10506
aco_ptr<Pseudo_branch_instruction> branch;
10507
aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10508
branch.reset(
10509
create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10510
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10511
branch->definitions[0].setHint(vcc);
10512
branch->operands[0] = Operand(cond);
10513
branch->operands[0].setFixed(scc);
10514
ctx->block->instructions.emplace_back(std::move(branch));
10515
10516
ic->BB_if_idx = ctx->block->index;
10517
ic->BB_endif = Block();
10518
ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10519
10520
ctx->cf_info.has_branch = false;
10521
ctx->cf_info.parent_loop.has_divergent_branch = false;
10522
10523
/** emit then block */
10524
ctx->program->next_uniform_if_depth++;
10525
Block* BB_then = ctx->program->create_and_insert_block();
10526
add_edge(ic->BB_if_idx, BB_then);
10527
append_logical_start(BB_then);
10528
ctx->block = BB_then;
10529
}
10530
10531
static void
10532
begin_uniform_if_else(isel_context* ctx, if_context* ic)
10533
{
10534
Block* BB_then = ctx->block;
10535
10536
ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10537
ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10538
10539
if (!ic->uniform_has_then_branch) {
10540
append_logical_end(BB_then);
10541
/* branch from then block to endif block */
10542
aco_ptr<Pseudo_branch_instruction> branch;
10543
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10544
Format::PSEUDO_BRANCH, 0, 1));
10545
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10546
branch->definitions[0].setHint(vcc);
10547
BB_then->instructions.emplace_back(std::move(branch));
10548
add_linear_edge(BB_then->index, &ic->BB_endif);
10549
if (!ic->then_branch_divergent)
10550
add_logical_edge(BB_then->index, &ic->BB_endif);
10551
BB_then->kind |= block_kind_uniform;
10552
}
10553
10554
ctx->cf_info.has_branch = false;
10555
ctx->cf_info.parent_loop.has_divergent_branch = false;
10556
10557
/** emit else block */
10558
Block* BB_else = ctx->program->create_and_insert_block();
10559
add_edge(ic->BB_if_idx, BB_else);
10560
append_logical_start(BB_else);
10561
ctx->block = BB_else;
10562
}
10563
10564
static void
10565
end_uniform_if(isel_context* ctx, if_context* ic)
10566
{
10567
Block* BB_else = ctx->block;
10568
10569
if (!ctx->cf_info.has_branch) {
10570
append_logical_end(BB_else);
10571
/* branch from then block to endif block */
10572
aco_ptr<Pseudo_branch_instruction> branch;
10573
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10574
Format::PSEUDO_BRANCH, 0, 1));
10575
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10576
branch->definitions[0].setHint(vcc);
10577
BB_else->instructions.emplace_back(std::move(branch));
10578
add_linear_edge(BB_else->index, &ic->BB_endif);
10579
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10580
add_logical_edge(BB_else->index, &ic->BB_endif);
10581
BB_else->kind |= block_kind_uniform;
10582
}
10583
10584
ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10585
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10586
10587
/** emit endif merge block */
10588
ctx->program->next_uniform_if_depth--;
10589
if (!ctx->cf_info.has_branch) {
10590
ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10591
append_logical_start(ctx->block);
10592
}
10593
}
10594
10595
static bool
10596
visit_if(isel_context* ctx, nir_if* if_stmt)
10597
{
10598
Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10599
Builder bld(ctx->program, ctx->block);
10600
aco_ptr<Pseudo_branch_instruction> branch;
10601
if_context ic;
10602
10603
if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10604
/**
10605
* Uniform conditionals are represented in the following way*) :
10606
*
10607
* The linear and logical CFG:
10608
* BB_IF
10609
* / \
10610
* BB_THEN (logical) BB_ELSE (logical)
10611
* \ /
10612
* BB_ENDIF
10613
*
10614
* *) Exceptions may be due to break and continue statements within loops
10615
* If a break/continue happens within uniform control flow, it branches
10616
* to the loop exit/entry block. Otherwise, it branches to the next
10617
* merge block.
10618
**/
10619
10620
assert(cond.regClass() == ctx->program->lane_mask);
10621
cond = bool_to_scalar_condition(ctx, cond);
10622
10623
begin_uniform_if_then(ctx, &ic, cond);
10624
visit_cf_list(ctx, &if_stmt->then_list);
10625
10626
begin_uniform_if_else(ctx, &ic);
10627
visit_cf_list(ctx, &if_stmt->else_list);
10628
10629
end_uniform_if(ctx, &ic);
10630
} else { /* non-uniform condition */
10631
/**
10632
* To maintain a logical and linear CFG without critical edges,
10633
* non-uniform conditionals are represented in the following way*) :
10634
*
10635
* The linear CFG:
10636
* BB_IF
10637
* / \
10638
* BB_THEN (logical) BB_THEN (linear)
10639
* \ /
10640
* BB_INVERT (linear)
10641
* / \
10642
* BB_ELSE (logical) BB_ELSE (linear)
10643
* \ /
10644
* BB_ENDIF
10645
*
10646
* The logical CFG:
10647
* BB_IF
10648
* / \
10649
* BB_THEN (logical) BB_ELSE (logical)
10650
* \ /
10651
* BB_ENDIF
10652
*
10653
* *) Exceptions may be due to break and continue statements within loops
10654
**/
10655
10656
begin_divergent_if_then(ctx, &ic, cond);
10657
visit_cf_list(ctx, &if_stmt->then_list);
10658
10659
begin_divergent_if_else(ctx, &ic);
10660
visit_cf_list(ctx, &if_stmt->else_list);
10661
10662
end_divergent_if(ctx, &ic);
10663
}
10664
10665
return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10666
}
10667
10668
static bool
10669
visit_cf_list(isel_context* ctx, struct exec_list* list)
10670
{
10671
foreach_list_typed (nir_cf_node, node, node, list) {
10672
switch (node->type) {
10673
case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10674
case nir_cf_node_if:
10675
if (!visit_if(ctx, nir_cf_node_as_if(node)))
10676
return true;
10677
break;
10678
case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10679
default: unreachable("unimplemented cf list type");
10680
}
10681
}
10682
return false;
10683
}
10684
10685
static void
10686
export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10687
{
10688
assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10689
10690
int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10691
? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10692
: ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10693
unsigned mask = ctx->outputs.mask[slot];
10694
if (!is_pos && !mask)
10695
return;
10696
if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10697
return;
10698
aco_ptr<Export_instruction> exp{
10699
create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10700
exp->enabled_mask = mask;
10701
for (unsigned i = 0; i < 4; ++i) {
10702
if (mask & (1 << i))
10703
exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10704
else
10705
exp->operands[i] = Operand(v1);
10706
}
10707
/* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
10708
* Setting valid_mask=1 prevents it and has no other effect.
10709
*/
10710
exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10711
exp->done = false;
10712
exp->compressed = false;
10713
if (is_pos)
10714
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10715
else
10716
exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10717
ctx->block->instructions.emplace_back(std::move(exp));
10718
}
10719
10720
static void
10721
export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10722
{
10723
aco_ptr<Export_instruction> exp{
10724
create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10725
exp->enabled_mask = 0;
10726
for (unsigned i = 0; i < 4; ++i)
10727
exp->operands[i] = Operand(v1);
10728
if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10729
exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10730
exp->enabled_mask |= 0x1;
10731
}
10732
if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10733
exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10734
exp->enabled_mask |= 0x4;
10735
}
10736
if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10737
if (ctx->options->chip_class < GFX9) {
10738
exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10739
exp->enabled_mask |= 0x8;
10740
} else {
10741
Builder bld(ctx->program, ctx->block);
10742
10743
Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10744
Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10745
if (exp->operands[2].isTemp())
10746
out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10747
10748
exp->operands[2] = Operand(out);
10749
exp->enabled_mask |= 0x4;
10750
}
10751
}
10752
if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10753
exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10754
exp->enabled_mask |= 0x2;
10755
} else if (ctx->options->force_vrs_rates) {
10756
/* Bits [2:3] = VRS rate X
10757
* Bits [4:5] = VRS rate Y
10758
*
10759
* The range is [-2, 1]. Values:
10760
* 1: 2x coarser shading rate in that direction.
10761
* 0: normal shading rate
10762
* -1: 2x finer shading rate (sample shading, not directional)
10763
* -2: 4x finer shading rate (sample shading, not directional)
10764
*
10765
* Sample shading can't go above 8 samples, so both numbers can't be -2
10766
* at the same time.
10767
*/
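/* Worked example derived from the bit layout above: a uniform 2x2 coarse rate would be
 * X = 1 and Y = 1, i.e. (1 << 2) | (1 << 4) = 0x14. */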
10768
Builder bld(ctx->program, ctx->block);
10769
Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10770
10771
/* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10772
Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10773
Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10774
rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10775
bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10776
10777
exp->operands[1] = Operand(rates);
10778
exp->enabled_mask |= 0x2;
10779
}
10780
10781
exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10782
exp->done = false;
10783
exp->compressed = false;
10784
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10785
ctx->block->instructions.emplace_back(std::move(exp));
10786
}
10787
10788
static void
10789
create_vs_exports(isel_context* ctx)
10790
{
10791
assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10792
10793
radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10794
? &ctx->program->info->tes.outinfo
10795
: &ctx->program->info->vs.outinfo;
10796
10797
ctx->block->kind |= block_kind_export_end;
10798
10799
if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10800
ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10801
if (ctx->stage.has(SWStage::TES))
10802
ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10803
get_arg(ctx, ctx->args->ac.tes_patch_id);
10804
else
10805
ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10806
get_arg(ctx, ctx->args->ac.vs_prim_id);
10807
}
10808
10809
if (ctx->options->key.has_multiview_view_index) {
10810
ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10811
ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10812
as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10813
}
10814
10815
/* Hardware requires position data to always be exported, even if the
10816
* application did not write gl_Position.
10817
*/
10818
ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10819
10820
/* the order in which these position exports are created is important: next_pos assigns them consecutive SQ_EXP_POS targets, so the gl_Position export lands in POS0 */
10821
int next_pos = 0;
10822
export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10823
10824
bool writes_primitive_shading_rate =
10825
outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10826
if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10827
writes_primitive_shading_rate) {
10828
export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10829
}
10830
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10831
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10832
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10833
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10834
10835
if (ctx->export_clip_dists) {
10836
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10837
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10838
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10839
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10840
}
10841
10842
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10843
if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10844
i != VARYING_SLOT_VIEWPORT)
10845
continue;
10846
10847
export_vs_varying(ctx, i, false, NULL);
10848
}
10849
}
10850
10851
static bool
10852
export_fs_mrt_z(isel_context* ctx)
10853
{
10854
Builder bld(ctx->program, ctx->block);
10855
unsigned enabled_channels = 0;
10856
bool compr = false;
10857
Operand values[4];
10858
10859
for (unsigned i = 0; i < 4; ++i) {
10860
values[i] = Operand(v1);
10861
}
10862
10863
/* Both stencil and sample mask only need 16-bits. */
10864
if (!ctx->program->info->ps.writes_z &&
10865
(ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10866
compr = true; /* COMPR flag */
10867
10868
if (ctx->program->info->ps.writes_stencil) {
10869
/* Stencil should be in X[23:16]. */
10870
values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10871
values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10872
enabled_channels |= 0x3;
10873
}
10874
10875
if (ctx->program->info->ps.writes_sample_mask) {
10876
/* SampleMask should be in Y[15:0]. */
10877
values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10878
enabled_channels |= 0xc;
10879
}
10880
} else {
10881
if (ctx->program->info->ps.writes_z) {
10882
values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10883
enabled_channels |= 0x1;
10884
}
10885
10886
if (ctx->program->info->ps.writes_stencil) {
10887
values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10888
enabled_channels |= 0x2;
10889
}
10890
10891
if (ctx->program->info->ps.writes_sample_mask) {
10892
values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10893
enabled_channels |= 0x4;
10894
}
10895
}
10896
10897
/* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10898
* writemask component.
10899
*/
10900
if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10901
ctx->options->family != CHIP_HAINAN) {
10902
enabled_channels |= 0x1;
10903
}
10904
10905
bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10906
V_008DFC_SQ_EXP_MRTZ, compr);
10907
10908
return true;
10909
}
10910
10911
static bool
10912
export_fs_mrt_color(isel_context* ctx, int slot)
10913
{
10914
Builder bld(ctx->program, ctx->block);
10915
unsigned write_mask = ctx->outputs.mask[slot];
10916
Operand values[4];
10917
10918
for (unsigned i = 0; i < 4; ++i) {
10919
if (write_mask & (1 << i)) {
10920
values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10921
} else {
10922
values[i] = Operand(v1);
10923
}
10924
}
10925
10926
unsigned target, col_format;
10927
unsigned enabled_channels = 0;
10928
aco_opcode compr_op = (aco_opcode)0;
10929
bool compr = false;
10930
10931
slot -= FRAG_RESULT_DATA0;
10932
target = V_008DFC_SQ_EXP_MRT + slot;
10933
col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10934
10935
bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10936
bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10937
bool is_16bit = values[0].regClass() == v2b;
10938
10939
/* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10940
if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10941
(col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10942
col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10943
col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10944
for (int i = 0; i < 4; i++) {
10945
if (!(write_mask & (1 << i)))
10946
continue;
10947
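/* v_cmp_class_f32 with a class mask of 3 (bit 0 = signaling NaN, bit 1 = quiet NaN)
 * tests for any NaN. */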
10948
Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10949
values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10950
values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10951
bld.copy(bld.def(v1), Operand::zero()), isnan);
10952
}
10953
}
10954
10955
switch (col_format) {
10956
case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10957
10958
case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10959
10960
case V_028714_SPI_SHADER_32_AR:
10961
if (ctx->options->chip_class >= GFX10) {
10962
/* Special case: on GFX10, the outputs are different for 32_AR */
10963
enabled_channels = 0x3;
10964
values[1] = values[3];
10965
values[3] = Operand(v1);
10966
} else {
10967
enabled_channels = 0x9;
10968
}
10969
break;
10970
10971
case V_028714_SPI_SHADER_FP16_ABGR:
10972
for (int i = 0; i < 2; i++) {
10973
bool enabled = (write_mask >> (i * 2)) & 0x3;
10974
if (enabled) {
10975
enabled_channels |= 0x3 << (i * 2);
10976
if (is_16bit) {
10977
values[i] =
10978
bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10979
values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10980
values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10981
} else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10982
values[i] =
10983
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10984
values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10985
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10986
} else {
10987
values[i] =
10988
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10989
values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10990
values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10991
}
10992
} else {
10993
values[i] = Operand(v1);
10994
}
10995
}
10996
values[2] = Operand(v1);
10997
values[3] = Operand(v1);
10998
compr = true;
10999
break;
11000
11001
case V_028714_SPI_SHADER_UNORM16_ABGR:
11002
if (is_16bit && ctx->options->chip_class >= GFX9) {
11003
compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
11004
} else {
11005
compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
11006
}
11007
break;
11008
11009
case V_028714_SPI_SHADER_SNORM16_ABGR:
11010
if (is_16bit && ctx->options->chip_class >= GFX9) {
11011
compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
11012
} else {
11013
compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
11014
}
11015
break;
11016
11017
case V_028714_SPI_SHADER_UINT16_ABGR: {
11018
compr_op = aco_opcode::v_cvt_pk_u16_u32;
11019
if (is_int8 || is_int10) {
11020
/* clamp */
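/* is_int8 formats have 8-bit components (max 255); is_int10 formats have 10-bit RGB
 * components (max 1023) and a 2-bit alpha, hence the separate clamp to 3 for component 3. */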
11021
uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
11022
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11023
11024
for (unsigned i = 0; i < 4; i++) {
11025
if ((write_mask >> i) & 1) {
11026
values[i] =
11027
bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
11028
i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
11029
}
11030
}
11031
} else if (is_16bit) {
11032
for (unsigned i = 0; i < 4; i++) {
11033
if ((write_mask >> i) & 1) {
11034
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
11035
values[i] = Operand(tmp);
11036
}
11037
}
11038
}
11039
break;
11040
}
11041
11042
case V_028714_SPI_SHADER_SINT16_ABGR:
11043
compr_op = aco_opcode::v_cvt_pk_i16_i32;
11044
if (is_int8 || is_int10) {
11045
/* clamp */
11046
uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11047
uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11048
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11049
Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11050
11051
for (unsigned i = 0; i < 4; i++) {
11052
if ((write_mask >> i) & 1) {
11053
values[i] =
11054
bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11055
i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11056
values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11057
i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11058
values[i]);
11059
}
11060
}
11061
} else if (is_16bit) {
11062
for (unsigned i = 0; i < 4; i++) {
11063
if ((write_mask >> i) & 1) {
11064
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11065
values[i] = Operand(tmp);
11066
}
11067
}
11068
}
11069
break;
11070
11071
case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11072
11073
case V_028714_SPI_SHADER_ZERO:
11074
default: return false;
11075
}
11076
11077
if ((bool)compr_op) {
11078
for (int i = 0; i < 2; i++) {
11079
/* check if at least one of the values to be compressed is enabled */
11080
bool enabled = (write_mask >> (i * 2)) & 0x3;
11081
if (enabled) {
11082
enabled_channels |= 0x3 << (i * 2);
11083
values[i] = bld.vop3(
11084
compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11085
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11086
} else {
11087
values[i] = Operand(v1);
11088
}
11089
}
11090
values[2] = Operand(v1);
11091
values[3] = Operand(v1);
11092
compr = true;
11093
} else if (!compr) {
11094
for (int i = 0; i < 4; i++)
11095
values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11096
}
11097
11098
bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11099
compr);
11100
return true;
11101
}
11102
11103
static void
create_fs_null_export(isel_context* ctx)
{
   /* FS must always have exports.
    * So when there are none, we need to add a null export.
    */

   Builder bld(ctx->program, ctx->block);
   unsigned dest = V_008DFC_SQ_EXP_NULL;
   bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
           /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
}

static void
create_fs_exports(isel_context* ctx)
{
   bool exported = false;

   /* Export depth, stencil and sample mask. */
   if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
       ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
      exported |= export_fs_mrt_z(ctx);

   /* Export all color render targets. */
   for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
      if (ctx->outputs.mask[i])
         exported |= export_fs_mrt_color(ctx, i);

   if (!exported)
      create_fs_null_export(ctx);

   ctx->block->kind |= block_kind_export_end;
}

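/* Helper for the merged-shader paths below: emits a p_barrier that both
 * synchronizes all waves of the workgroup and acts as an acquire-release
 * fence for LDS (storage_shared) at workgroup scope.
 */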
static void
create_workgroup_barrier(Builder& bld)
{
   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
}

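/* Stores one transform feedback output (a run of consecutive components) to
 * its streamout buffer with MUBUF stores. Components the shader never wrote
 * are stored as zero, and vec3 stores are split in two on GFX6, which has no
 * buffer_store_dwordx3.
 */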
static void
emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
                   const struct radv_stream_output* output)
{
   unsigned num_comps = util_bitcount(output->component_mask);
   unsigned writemask = (1 << num_comps) - 1;
   unsigned loc = output->location;
   unsigned buf = output->buffer;

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   unsigned first_comp = ffs(output->component_mask) - 1;

   Temp out[4];
   bool all_undef = true;
   assert(ctx->stage.hw == HWStage::VS);
   for (unsigned i = 0; i < num_comps; i++) {
      out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
      all_undef = all_undef && !out[i].id();
   }
   if (all_undef)
      return;

   while (writemask) {
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      if (count == 3 && ctx->options->chip_class == GFX6) {
         /* GFX6 doesn't support storing vec3, split it. */
         writemask |= 1u << (start + 2);
         count = 2;
      }

      unsigned offset = output->offset + start * 4;

      Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
      for (int i = 0; i < count; ++i)
         vec->operands[i] =
            (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand::zero();
      vec->definitions[0] = Definition(write_data);
      ctx->block->instructions.emplace_back(std::move(vec));

      aco_opcode opcode;
      switch (count) {
      case 1: opcode = aco_opcode::buffer_store_dword; break;
      case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
      case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
      case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
      default: unreachable("Unsupported dword count.");
      }

      aco_ptr<MUBUF_instruction> store{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
      store->operands[0] = Operand(so_buffers[buf]);
      store->operands[1] = Operand(so_write_offset[buf]);
      store->operands[2] = Operand::c32(0);
      store->operands[3] = Operand(write_data);
      if (offset > 4095) {
         /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway.
          * Fold the constant offset into the VGPR write offset instead of using the
          * 12-bit immediate field.
          */
         Builder bld(ctx->program, ctx->block);
         store->operands[1] =
            bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
      } else {
         store->offset = offset;
      }
      store->offen = true;
      store->glc = true;
      store->dlc = false;
      store->slc = true;
      ctx->block->instructions.emplace_back(std::move(store));
   }
}

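/* Emits all transform feedback stores for the given vertex stream. The number
 * of vertices that still fit in the streamout buffers is extracted from the
 * streamout_config SGPR, and lanes whose vertex index exceeds that count are
 * skipped with a divergent branch before the per-output stores are emitted.
 */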
static void
emit_streamout(isel_context* ctx, unsigned stream)
{
   Builder bld(ctx->program, ctx->block);

   Temp so_buffers[4];
   Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      Operand off = bld.copy(bld.def(s1), Operand::c32(i * 16u));
      so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
   }

   Temp so_vtx_count =
      bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
               get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));

   Temp tid = emit_mbcnt(ctx, bld.tmp(v1));

   Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);

   if_context ic;
   begin_divergent_if_then(ctx, &ic, can_emit);

   bld.reset(ctx->block);

   Temp so_write_index =
      bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);

   Temp so_write_offset[4];

   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      if (stride == 1) {
         Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
                                get_arg(ctx, ctx->args->ac.streamout_write_index),
                                get_arg(ctx, ctx->args->ac.streamout_offset[i]));
         Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);

         so_write_offset[i] =
            bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
      } else {
         Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
         Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
         so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
      }
   }

   for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
      struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
      if (stream != output->stream)
         continue;

      emit_stream_output(ctx, so_buffers, so_write_offset, output);
   }

   begin_divergent_if_else(ctx, &ic);
   end_divergent_if(ctx, &ic);
}

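/* Creates the p_startpgm pseudo instruction: one definition per enabled shader
 * argument, fixed to the SGPR/VGPR the hardware loads it into (VGPR arguments
 * start at physical register 256 in ACO's register numbering).
 */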
Pseudo_instruction*
add_startpgm(struct isel_context* ctx)
{
   unsigned arg_count = ctx->args->ac.arg_count;
   if (ctx->stage == fragment_fs) {
      /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr
       * itself and then communicates the results back via the ELF binary.
       * Mirror what LLVM does by re-mapping the VGPR arguments here.
       *
       * TODO: If we made the FS input scanning code into a separate pass that
       * could run before argument setup, then this wouldn't be necessary
       * anymore.
       */
      struct ac_shader_args* args = &ctx->args->ac;
      arg_count = 0;
      for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) {
         if (args->args[i].file != AC_ARG_VGPR) {
            arg_count++;
            continue;
         }

         if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) {
            args->args[i].skip = true;
         } else {
            args->args[i].offset = vgpr_reg;
            vgpr_reg += args->args[i].size;
            arg_count++;
         }
         vgpr_arg++;
      }
   }

   aco_ptr<Pseudo_instruction> startpgm{
      create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)};
   for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
      if (ctx->args->ac.args[i].skip)
         continue;

      enum ac_arg_regfile file = ctx->args->ac.args[i].file;
      unsigned size = ctx->args->ac.args[i].size;
      unsigned reg = ctx->args->ac.args[i].offset;
      RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
      Temp dst = ctx->program->allocateTmp(type);
      ctx->arg_temps[i] = dst;
      startpgm->definitions[arg] = Definition(dst);
      startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
      arg++;
   }
   Pseudo_instruction* instr = startpgm.get();
   ctx->block->instructions.push_back(std::move(startpgm));

   /* Stash these in the program so that they can be accessed later when
    * handling spilling.
    */
   ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
   ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);

   return instr;
}

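/* Workaround for chips with the LS VGPR init bug (see has_ls_vgpr_init_bug in
 * select_program below): when the workgroup contains no HS threads, the SPI
 * loads the LS input VGPRs starting at VGPR 0 instead of their normal slots,
 * so the affected arguments are picked from the shifted locations with
 * v_cndmask_b32.
 */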
void
fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
{
   assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
   Builder bld(ctx->program, ctx->block);
   constexpr unsigned hs_idx = 1u;
   Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                                              get_arg(ctx, ctx->args->ac.merged_wave_info),
                                              Operand::c32((8u << 16) | (hs_idx * 8u)));
   Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());

   /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */

   Temp instance_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
               get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
   Temp vs_rel_patch_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
               get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
   Temp vertex_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
               get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);

   ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
   ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
   ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
}

void
split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
{
   /* Split all arguments except for the first (ring_offsets) and the last
    * (exec) so that the dead channels don't stay live throughout the program.
    */
   for (int i = 1; i < startpgm->definitions.size(); i++) {
      if (startpgm->definitions[i].regClass().size() > 1) {
         emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
                           startpgm->definitions[i].regClass().size());
      }
   }
}

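/* If both center and centroid interpolation are in use, the hardware only
 * computes one set of barycentrics and reports which one through the sign bit
 * of prim_mask, so the centroid coordinates have to be replaced by the center
 * ones whenever that bit is set.
 */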
void
handle_bc_optimize(isel_context* ctx)
{
   /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
   Builder bld(ctx->program, ctx->block);
   uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
   bool uses_center =
      G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
   bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) ||
                        G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
   ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
   ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
   if (uses_center && uses_centroid) {
      Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
                              get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());

      if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
         Temp new_coord[2];
         for (unsigned i = 0; i < 2; i++) {
            Temp persp_centroid =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
            Temp persp_center =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
            new_coord[i] =
               bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
         }
         ctx->persp_centroid = bld.tmp(v2);
         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
                    Operand(new_coord[0]), Operand(new_coord[1]));
         emit_split_vector(ctx, ctx->persp_centroid, 2);
      }

      if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
         Temp new_coord[2];
         for (unsigned i = 0; i < 2; i++) {
            Temp linear_centroid =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
            Temp linear_center =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
            new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
                                    linear_center, sel);
         }
         ctx->linear_centroid = bld.tmp(v2);
         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
                    Operand(new_coord[0]), Operand(new_coord[1]));
         emit_split_vector(ctx, ctx->linear_centroid, 2);
      }
   }
}

void
setup_fp_mode(isel_context* ctx, nir_shader* shader)
{
   Program* program = ctx->program;

   unsigned float_controls = shader->info.float_controls_execution_mode;

   program->next_fp_mode.preserve_signed_zero_inf_nan32 =
      float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
      float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
                        FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);

   program->next_fp_mode.must_flush_denorms32 =
      float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
   program->next_fp_mode.must_flush_denorms16_64 =
      float_controls &
      (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);

   program->next_fp_mode.care_about_round32 =
      float_controls &
      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);

   program->next_fp_mode.care_about_round16_64 =
      float_controls &
      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
       FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);

   /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
    * the precision seems needed for Wolfenstein: Youngblood to render correctly */
   if (program->next_fp_mode.must_flush_denorms16_64)
      program->next_fp_mode.denorm16_64 = 0;
   else
      program->next_fp_mode.denorm16_64 = fp_denorm_keep;

   /* preserving fp32 denorms is expensive, so only do it if asked */
   if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
      program->next_fp_mode.denorm32 = fp_denorm_keep;
   else
      program->next_fp_mode.denorm32 = 0;

   if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      program->next_fp_mode.round32 = fp_round_tz;
   else
      program->next_fp_mode.round32 = fp_round_ne;

   if (float_controls &
       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      program->next_fp_mode.round16_64 = fp_round_tz;
   else
      program->next_fp_mode.round16_64 = fp_round_ne;

   ctx->block->fp_mode = program->next_fp_mode;
}

void
cleanup_cfg(Program* program)
{
   /* create linear_succs/logical_succs */
   for (Block& BB : program->blocks) {
      for (unsigned idx : BB.linear_preds)
         program->blocks[idx].linear_succs.emplace_back(BB.index);
      for (unsigned idx : BB.logical_preds)
         program->blocks[idx].logical_succs.emplace_back(BB.index);
   }
}

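/* Turns a lane count in an SGPR into an exec-style bitmask with that many low
 * bits set. s_bfm_b64 computes (1ull << count) - 1 but treats a count of 64 as
 * 0, so for wave64 the all-ones mask is selected explicitly when bit 6 of the
 * count is set.
 */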
Temp
lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
{
   assert(count.regClass() == s1);

   Builder bld(ctx->program, ctx->block);
   Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
   Temp cond;

   if (ctx->program->wave_size == 64) {
      /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */
      if (!allow64)
         return mask;

      /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
      Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
                                Operand::c32(6u /* log2(64) */));
      cond =
         bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
   } else {
      /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
       * the register */
      cond = emit_extract_vector(ctx, mask, 0, bld.lm);
   }

   return cond;
}

Temp
merged_wave_info_to_mask(isel_context* ctx, unsigned i)
{
   Builder bld(ctx->program, ctx->block);

   /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
   Temp count = i == 0
                   ? get_arg(ctx, ctx->args->ac.merged_wave_info)
                   : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
                              get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));

   return lanecount_to_mask(ctx, count);
}

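/* Sends the GS_ALLOC_REQ message for NGG workgroups: the vertex count goes
 * into the low 12 bits of m0 and the primitive count into the bits above
 * them. On Navi 1x an empty workgroup still has to allocate one primitive and
 * one vertex and export a degenerate (NaN-position) triangle.
 */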
void
ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
{
   assert(vtx_cnt.id() && prm_cnt.id());

   Builder bld(ctx->program, ctx->block);
   Temp prm_cnt_0;

   if (ctx->program->chip_class == GFX10 &&
       (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
      /* Navi 1x workaround: check whether the workgroup has no output.
       * If so, change the number of exported vertices and primitives to 1.
       */
      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
                         bld.scc(prm_cnt_0));
      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
                         bld.scc(prm_cnt_0));
   }

   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
   Temp tmp =
      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);

   /* Request the SPI to allocate space for the primitives and vertices
    * that will be exported by the threadgroup.
    */
   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);

   if (prm_cnt_0.id()) {
      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
       * It can't have all-zero positions because that would render an undesired pixel with
       * conservative rasterization.
       */
      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));

      if_context ic_prim_0;
      begin_divergent_if_then(ctx, &ic_prim_0, cond);
      bld.reset(ctx->block);
      ctx->block->kind |= block_kind_export_end;

      /* Use zero: means that it's a triangle whose every vertex index is 0. */
      Temp zero = bld.copy(bld.def(v1), Operand::zero());
      /* Use NaN for the coordinates, so that the rasterizer always culls it. */
      Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));

      bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
              V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
              false /* valid mask */);
      bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
              V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
              true /* valid mask */);

      begin_divergent_if_else(ctx, &ic_prim_0);
      end_divergent_if(ctx, &ic_prim_0);
      bld.reset(ctx->block);
   }
}

} /* end namespace */

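/* Main entry point of instruction selection: lowers each NIR shader of a
 * (possibly merged) pipeline stage into ACO IR, wrapping later merged shader
 * parts in a divergent branch on merged_wave_info so that only their own
 * waves execute them.
 */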
void
select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
               ac_shader_config* config, struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
   if_context ic_merged_wave_info;
   bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);

   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader* nir = shaders[i];
      init_context(&ctx, nir);

      setup_fp_mode(&ctx, nir);

      if (!i) {
         /* needs to be after init_context() for FS */
         Pseudo_instruction* startpgm = add_startpgm(&ctx);
         append_logical_start(ctx.block);

         if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
            fix_ls_vgpr_init_bug(&ctx, startpgm);

         split_arguments(&ctx, startpgm);

         if (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES)) {
            Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
         }
      }

      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
      nir_function_impl* func = nir_shader_get_entrypoint(nir);
      bool empty_shader =
         nir_cf_list_is_empty_block(&func->body) &&
         ((nir->info.stage == MESA_SHADER_VERTEX &&
           (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
          (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));

      bool check_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
      bool endif_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));

      if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
          program->stage.num_sw_stages() == 1) {
         /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
          * s_sendmsg(GS_ALLOC_REQ). */
         Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
      }

      if (check_merged_wave_info) {
         Temp cond = merged_wave_info_to_mask(&ctx, i);
         begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
      }

      if (i) {
         Builder bld(ctx.program, ctx.block);

         /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
         bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
                                 ctx.tcs_temp_only_inputs == nir->info.inputs_read;

         if (!ngg_gs && !tcs_skip_barrier)
            create_workgroup_barrier(bld);

         if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
            ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
                                        get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
                                        Operand::c32(8u), Operand::zero());
         }
      } else if (ctx.stage == geometry_gs)
         ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);

      if (ctx.stage == fragment_fs)
         handle_bc_optimize(&ctx);

      visit_cf_list(&ctx, &func->body);

      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
         emit_streamout(&ctx, 0);

      if (ctx.stage.hw == HWStage::VS) {
         create_vs_exports(&ctx);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
         Builder bld(ctx.program, ctx.block);
         bld.barrier(aco_opcode::p_barrier,
                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
                  sendmsg_gs_done(false, false, 0));
      }

      if (ctx.stage == fragment_fs) {
         create_fs_exports(&ctx);
      }

      if (endif_merged_wave_info) {
         begin_divergent_if_else(&ctx, &ic_merged_wave_info);
         end_divergent_if(&ctx, &ic_merged_wave_info);
      }

      if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
         /* Outputs of the previous stage are inputs to the next stage */
         ctx.inputs = ctx.outputs;
         ctx.outputs = shader_io_state();
      }

      cleanup_context(&ctx);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   Builder bld(ctx.program, ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

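/* Builds the GS copy shader: a small hardware VS that reads the GS output
 * (GSVS) ring with per-vertex offsets and re-emits the attributes as VS
 * exports, handling streamout and, when several streams are written,
 * branching on the stream id extracted from streamout_config.
 */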
void
select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                      struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
                             program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));

   Operand stream_id = Operand::zero();
   if (args->shader_info->so.num_outputs)
      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                           get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));

   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
                              get_arg(&ctx, ctx.args->ac.vertex_id));

   std::stack<if_context> if_contexts;

   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream_id.isConstant() && stream != stream_id.constantValue())
         continue;

      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
         continue;

      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));

      if (!stream_id.isConstant()) {
         Temp cond =
            bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
         if_contexts.emplace();
         begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
         bld.reset(ctx.block);
      }

      unsigned offset = 0;
      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
         if (args->shader_info->gs.output_streams[i] != stream)
            continue;

         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
         unsigned length = util_last_bit(output_usage_mask);
         for (unsigned j = 0; j < length; ++j) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            Temp val = bld.tmp(v1);
            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
                            true, true);

            ctx.outputs.mask[i] |= 1 << j;
            ctx.outputs.temps[i * 4u + j] = val;

            offset++;
         }
      }

      if (args->shader_info->so.num_outputs) {
         emit_streamout(&ctx, stream);
         bld.reset(ctx.block);
      }

      if (stream == 0) {
         create_vs_exports(&ctx);
      }

      if (!stream_id.isConstant()) {
         begin_uniform_if_else(&ctx, &if_contexts.top());
         bld.reset(ctx.block);
      }
   }

   while (!if_contexts.empty()) {
      end_uniform_if(&ctx, &if_contexts.top());
      if_contexts.pop();
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.reset(ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

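/* Builds a minimal trap handler for GFX8: it loads a buffer descriptor from
 * the TMA register pair and dumps TTMP0-TTMP1 plus a few hardware registers
 * (STATUS, TRAP_STS, HW_ID, IB_STS) into that buffer for later inspection.
 */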
void
select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
                           struct radv_shader_args* args)
{
   assert(args->options->chip_class == GFX8);

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   program->workgroup_size = 1; /* XXX */

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   /* Load the buffer descriptor from TMA. */
   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
            Operand::zero());

   /* Store TTMP0-TTMP1. */
   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);

   uint32_t hw_regs_idx[] = {
      2, /* HW_REG_STATUS */
      3, /* HW_REG_TRAP_STS */
      4, /* HW_REG_HW_ID */
      7, /* HW_REG_IB_STS */
   };

   /* Store some hardware registers. */
   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
      /* "((size - 1) << 11) | register" */
      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
               ((20 - 1) << 11) | hw_regs_idx[i]);

      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}
} // namespace aco