GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_instruction_selection.cpp
/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/ac_exp_param.h"
#include "common/sid.h"
#include "vulkan/radv_descriptor_set.h"

#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>
#include <vector>

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
          const char* msg)
{
   char* out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE* const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);

static void
add_logical_edge(unsigned pred_idx, Block* succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void
add_linear_edge(unsigned pred_idx, Block* succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void
add_edge(unsigned pred_idx, Block* succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void
append_logical_start(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void
append_logical_end(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp
get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

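/* Per-lane prefix count: v_mbcnt_{lo,hi} count the set bits of `mask` below the
 * current lane and add `base`; with mask = exec this yields each lane's index
 * among the active lanes. */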
Temp
emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo = Operand::c32(-1u);
   Operand mask_hi = Operand::c32(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->chip_class <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

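/* Wraps a value in p_wqm so it is computed in whole-quad mode (helper lanes
 * enabled); this only matters for fragment shaders, other stages just copy. */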
Temp
emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
{
   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (bld.program->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   bld.program->needs_wqm |= program_needs_wqm;
   return dst;
}

static Temp
emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
                        index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
      Temp index_is_lo =
         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
      Builder::Result index_is_lo_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
                                     index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      /* We need one pair of shared VGPRs:
       * Note that these have twice the allocation granularity of normal VGPRs */
      ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
                        index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

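/* The 15-bit swizzle mask packs and/or/xor lane masks (5 bits each, matching the
 * ds_swizzle_b32 encoding): within a group of 32 lanes, lane i reads from lane
 * ((i & and_mask) | or_mask) ^ xor_mask. Common patterns are matched to cheaper
 * DPP controls below. */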
static Temp
emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

Temp
as_vgpr(isel_context* ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}

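/* Division by the constant b is strength-reduced using util_fast_udiv_info:
 * roughly dst = (((a >> pre_shift) + increment) * multiplier) >> 32 >> post_shift,
 * with power-of-two divisors handled as a plain right shift. */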
// assumes a != 0xffffffff
void
emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.copy(Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
               a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
               multiply_dst);
   }
}

void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
}

Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src =
            emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
                   bld.scc(val));
}

Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(bld, tmp, dst);
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 */
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
            bool sign_extend, Temp dst = Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) &&
          "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   } else if (ctx->options->chip_class >= GFX8) {
      assert(src_bits < 32);
      assert(src_bits != 8 || src.regClass() == v1b);
      assert(src_bits != 16 || src.regClass() == v2b);
      assert(dst_bits >= 16);
      aco_ptr<SDWA_instruction> sdwa{
         create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
      sdwa->operands[0] = Operand(src);
      sdwa->definitions[0] = Definition(tmp);
      if (sign_extend)
         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
      else
         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
      bld.insert(std::move(sdwa));
   } else {
      assert(src_bits < 32);
      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
      aco_opcode opcode = sign_extend ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
      bld.vop3(opcode, Definition(tmp), src, Operand::zero(),
               Operand::c32(src_bits == 8 ? 8u : 16u));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp
extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand::c32(swizzle), Operand::c32(src_size),
                 Operand::c32((mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

Temp
get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      assert(size == 1);
      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
                                           sgpr_extract_undef);
   }

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
                                    : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return dst;
   }
}

Temp
get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly 2 16bit components
    * which are within the same dword
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

uint32_t
get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
{
   nir_ssa_scalar scalar =
      nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
                     Operand::c32((unsigned)ctx->options->address32_hi));
}

void
emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
{
   aco_ptr<SOP2_instruction> sop2{
      create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
         if (src_ub <= 0xffff)
            sop2->operands[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            sop2->operands[i].set24bit(true);
      }
   }

   ctx->block->instructions.emplace_back(std::move(sop2));
}

void
emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
                      bool nuw = false, uint8_t uses_ub = 0)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Operand op0(src0);
   Operand op1(src1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
         if (src_ub <= 0xffff)
            bld.set16bit(i ? op1 : op0);
         else if (src_ub <= 0xffffff)
            bld.set24bit(i ? op1 : op0);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(op, bld.def(v1), op0, op1);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
   } else {
      if (nuw) {
         bld.nuw().vop2(op, Definition(dst), op0, op1);
      } else {
         bld.vop2(op, Definition(dst), op0, op1);
      }
   }
}

void
emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void
emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

Builder::Result
emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool swap_srcs = false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->dest.dest.ssa.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
   unsigned opsel_lo =
      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi =
      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   emit_split_vector(ctx, dst, 2);
   return res;
}

void
emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

void
emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
         case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
         case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
         case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
         case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
         case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
         case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
         case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
         case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
         case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
         case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
         case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
         case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
         case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
         case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
         case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
         case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
         case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
         default: /* eq and ne are commutative */ break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}

void
emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

void
emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
                aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
                                                             : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
                                                             : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void
emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
                   Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void
emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) &&
                els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op =
            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2
    * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

void
emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
               uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal =
      bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
               bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void
emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void
emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void
emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void
emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

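/* The GFX6 fallback below truncates by masking off the mantissa bits that lie
 * below the unbiased exponent: exponents < 0 produce a signed zero, and
 * exponents > 51 mean the value is already integral and is returned unchanged. */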
Temp
emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent =
      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
              fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
                               Operand::zero());
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp
emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
                             Operand::c32(0x3fefffffu));

   Temp isnan =
      bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   add->vop3().neg[1] = true;

   return add->definitions[0].getTemp();
}

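/* Unsigned saturating 32-bit add: GFX8+ use the VOP3 clamp bit on v_add_(co_)u32,
 * while older chips add with carry-out and select 0xffffffff on overflow. */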
Temp
uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->chip_class < GFX8) {
      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
                          add.def(1).getTemp());
   }

   Builder::Result add(NULL);
   if (bld.program->chip_class >= GFX9) {
      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
   } else {
      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
   }
   add.instr->vop3().clamp = 1;
   return dst.getTemp();
}

void
visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
{
   if (!instr->dest.dest.is_ssa) {
      isel_err(&instr->instr, "nir alu dst not in ssa");
      abort();
   }
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5: {
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->dest.dest.ssa.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
            vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
         bool use_s_pack = ctx->program->chip_class >= GFX9;
         Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));

         std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
         for (unsigned i = 0; i < num; i++) {
            unsigned packed_size = use_s_pack ? 16 : 32;
            unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
            unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
            if (nir_src_is_const(instr->src[i].src)) {
               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
               continue;
            }

            if (offset != packed_size - instr->dest.dest.ssa.bit_size)
               elems[i] =
                  bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);

            if (offset)
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                   Operand::c32(offset));

            if (packed[idx].id())
               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                      packed[idx]);
            else
               packed[idx] = elems[i];
         }

         if (use_s_pack) {
            for (unsigned i = 0; i < dst.size(); i++) {
               bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();

               if (packed[i * 2].id() && packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       packed[i * 2 + 1]);
               else if (packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
                                       Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
               else if (packed[i * 2].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       Operand::c32(const_vals[i * 2 + 1]));

               if (same)
                  const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
               else
                  const_vals[i] = 0;
            }
         }

         for (unsigned i = 0; i < dst.size(); i++) {
            if (const_vals[i] && packed[i].id())
               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                    Operand::c32(const_vals[i]), packed[i]);
            else if (!packed[i].id())
               packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
         }

         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else if (dst.size() == 2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
         else
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
                       packed[2]);
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
         /* use size() instead of bytes() for 8/16-bit */
         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
      } else {
         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->dest.dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         assert(dst.regClass() == bld.lm);
         /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
         Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
         bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
      } else if (dst.regClass() == v1) {
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
                  bld.vsub32(bld.def(v1), Operand::zero(), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp =
            bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
      } else if (dst.regClass() == s2) {
         Temp neg =
            bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
         Temp neqz;
         if (ctx->program->chip_class >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
         else
            neqz =
               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
                  .def(1)
                  .getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
         Temp gtz =
            bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->dest.dest.ssa.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1583
} else {
1584
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1585
}
1586
break;
1587
}
1588
case nir_op_ixor: {
1589
if (instr->dest.dest.ssa.bit_size == 1) {
1590
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1591
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1592
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1593
} else if (dst.regClass() == v2) {
1594
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1595
} else if (dst.regClass() == s1) {
1596
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1597
} else if (dst.regClass() == s2) {
1598
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1599
} else {
1600
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1601
}
1602
break;
1603
}
1604
case nir_op_ushr: {
1605
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1606
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1607
} else if (dst.regClass() == v2b) {
1608
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1609
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1610
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1611
} else if (dst.regClass() == v1) {
1612
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1613
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1614
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1615
get_alu_src(ctx, instr->src[0]));
1616
} else if (dst.regClass() == v2) {
1617
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1618
} else if (dst.regClass() == s2) {
1619
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1620
} else if (dst.regClass() == s1) {
1621
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1622
} else {
1623
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1624
}
1625
break;
1626
}
1627
case nir_op_ishl: {
1628
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1629
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1630
} else if (dst.regClass() == v2b) {
1631
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1632
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1633
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1634
} else if (dst.regClass() == v1) {
1635
emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1636
false, 1);
1637
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1638
bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1639
get_alu_src(ctx, instr->src[0]));
1640
} else if (dst.regClass() == v2) {
1641
emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1642
} else if (dst.regClass() == s1) {
1643
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1644
} else if (dst.regClass() == s2) {
1645
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1646
} else {
1647
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1648
}
1649
break;
1650
}
1651
case nir_op_ishr: {
1652
if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1653
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1654
} else if (dst.regClass() == v2b) {
1655
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1656
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1657
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1658
} else if (dst.regClass() == v1) {
1659
emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1660
} else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1661
bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1662
get_alu_src(ctx, instr->src[0]));
1663
} else if (dst.regClass() == v2) {
1664
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1665
} else if (dst.regClass() == s1) {
1666
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1667
} else if (dst.regClass() == s2) {
1668
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1669
} else {
1670
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1671
}
1672
break;
1673
}
1674
case nir_op_find_lsb: {
1675
Temp src = get_alu_src(ctx, instr->src[0]);
1676
if (src.regClass() == s1) {
1677
bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1678
} else if (src.regClass() == v1) {
1679
emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1680
} else if (src.regClass() == s2) {
1681
bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1682
} else {
1683
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1684
}
1685
break;
1686
}
1687
case nir_op_ufind_msb:
1688
case nir_op_ifind_msb: {
1689
Temp src = get_alu_src(ctx, instr->src[0]);
1690
if (src.regClass() == s1 || src.regClass() == s2) {
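/* The s_flbit opcodes count from the MSB side and return -1 when there is no
 * bit to report. (bits - 1) - msb_rev converts this to the LSB-based index
 * NIR expects; the subtraction only borrows in the -1 case, and the carry
 * then selects the -1 result below. */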
1691
aco_opcode op = src.regClass() == s2
1692
? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1693
: aco_opcode::s_flbit_i32_i64)
1694
: (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1695
: aco_opcode::s_flbit_i32);
1696
Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1697
1698
Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1699
Operand::c32(src.size() * 32u - 1u), msb_rev);
1700
Temp msb = sub.def(0).getTemp();
1701
Temp carry = sub.def(1).getTemp();
1702
1703
bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1704
bld.scc(carry));
1705
} else if (src.regClass() == v1) {
1706
aco_opcode op =
1707
instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1708
Temp msb_rev = bld.tmp(v1);
1709
emit_vop1_instruction(ctx, instr, op, msb_rev);
1710
Temp msb = bld.tmp(v1);
1711
Temp carry =
1712
bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1713
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1714
} else if (src.regClass() == v2) {
1715
aco_opcode op =
1716
instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1717
1718
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1719
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1720
1721
lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1722
bld.vop1(op, bld.def(v1), lo));
1723
hi = bld.vop1(op, bld.def(v1), hi);
1724
Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1725
1726
Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1727
1728
Temp msb = bld.tmp(v1);
1729
Temp carry =
1730
bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1731
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1732
} else {
1733
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1734
}
1735
break;
1736
}
1737
case nir_op_bitfield_reverse: {
1738
if (dst.regClass() == s1) {
1739
bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740
} else if (dst.regClass() == v1) {
1741
bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1742
} else {
1743
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1744
}
1745
break;
1746
}
1747
case nir_op_iadd: {
1748
if (dst.regClass() == s1) {
1749
emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1750
break;
1751
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1752
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1753
break;
1754
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1755
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1756
break;
1757
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1758
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1759
break;
1760
}
1761
1762
Temp src0 = get_alu_src(ctx, instr->src[0]);
1763
Temp src1 = get_alu_src(ctx, instr->src[1]);
1764
if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1765
bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1766
break;
1767
}
1768
1769
assert(src0.size() == 2 && src1.size() == 2);
1770
Temp src00 = bld.tmp(src0.type(), 1);
1771
Temp src01 = bld.tmp(dst.type(), 1);
1772
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1773
Temp src10 = bld.tmp(src1.type(), 1);
1774
Temp src11 = bld.tmp(dst.type(), 1);
1775
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
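/* 64-bit additions are lowered to a 32-bit add plus an add-with-carry on the
 * split halves. */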
1776
1777
if (dst.regClass() == s2) {
1778
Temp carry = bld.tmp(s1);
1779
Temp dst0 =
1780
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1781
Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1782
bld.scc(carry));
1783
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1784
} else if (dst.regClass() == v2) {
1785
Temp dst0 = bld.tmp(v1);
1786
Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1787
Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1788
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1789
} else {
1790
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1791
}
1792
break;
1793
}
1794
case nir_op_uadd_sat: {
1795
Temp src0 = get_alu_src(ctx, instr->src[0]);
1796
Temp src1 = get_alu_src(ctx, instr->src[1]);
1797
if (dst.regClass() == s1) {
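/* SCC holds the carry-out of s_add_u32, so s_cselect clamps the sum to
 * UINT32_MAX on overflow. */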
1798
Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1799
bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1800
bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1801
bld.scc(carry));
1802
} else if (dst.regClass() == v2b) {
1803
Instruction* add_instr;
1804
if (ctx->program->chip_class >= GFX10) {
1805
add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1806
} else {
1807
if (src1.type() == RegType::sgpr)
1808
std::swap(src0, src1);
1809
add_instr =
1810
bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1811
}
1812
add_instr->vop3().clamp = 1;
1813
} else if (dst.regClass() == v1) {
1814
uadd32_sat(bld, Definition(dst), src0, src1);
1815
} else {
1816
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1817
}
1818
break;
1819
}
1820
case nir_op_uadd_carry: {
1821
Temp src0 = get_alu_src(ctx, instr->src[0]);
1822
Temp src1 = get_alu_src(ctx, instr->src[1]);
1823
if (dst.regClass() == s1) {
1824
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1825
break;
1826
}
1827
if (dst.regClass() == v1) {
1828
Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1829
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1830
carry);
1831
break;
1832
}
1833
1834
Temp src00 = bld.tmp(src0.type(), 1);
1835
Temp src01 = bld.tmp(dst.type(), 1);
1836
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1837
Temp src10 = bld.tmp(src1.type(), 1);
1838
Temp src11 = bld.tmp(dst.type(), 1);
1839
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1840
if (dst.regClass() == s2) {
1841
Temp carry = bld.tmp(s1);
1842
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1843
carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1844
bld.scc(carry))
1845
.def(1)
1846
.getTemp();
1847
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1848
} else if (dst.regClass() == v2) {
1849
Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1850
carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1851
carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1852
Operand::c32(1u), carry);
1853
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1854
} else {
1855
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1856
}
1857
break;
1858
}
1859
case nir_op_isub: {
1860
if (dst.regClass() == s1) {
1861
emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1862
break;
1863
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1864
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1865
break;
1866
}
1867
1868
Temp src0 = get_alu_src(ctx, instr->src[0]);
1869
Temp src1 = get_alu_src(ctx, instr->src[1]);
1870
if (dst.regClass() == v1) {
1871
bld.vsub32(Definition(dst), src0, src1);
1872
break;
1873
} else if (dst.bytes() <= 2) {
1874
if (ctx->program->chip_class >= GFX10)
1875
bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1876
else if (src1.type() == RegType::sgpr)
1877
bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1878
else if (ctx->program->chip_class >= GFX8)
1879
bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1880
else
1881
bld.vsub32(Definition(dst), src0, src1);
1882
break;
1883
}
1884
1885
Temp src00 = bld.tmp(src0.type(), 1);
1886
Temp src01 = bld.tmp(dst.type(), 1);
1887
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1888
Temp src10 = bld.tmp(src1.type(), 1);
1889
Temp src11 = bld.tmp(dst.type(), 1);
1890
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1891
if (dst.regClass() == s2) {
1892
Temp borrow = bld.tmp(s1);
1893
Temp dst0 =
1894
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1895
Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1896
bld.scc(borrow));
1897
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1898
} else if (dst.regClass() == v2) {
1899
Temp lower = bld.tmp(v1);
1900
Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1901
Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1902
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1903
} else {
1904
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1905
}
1906
break;
1907
}
1908
case nir_op_usub_borrow: {
1909
Temp src0 = get_alu_src(ctx, instr->src[0]);
1910
Temp src1 = get_alu_src(ctx, instr->src[1]);
1911
if (dst.regClass() == s1) {
1912
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1913
break;
1914
} else if (dst.regClass() == v1) {
1915
Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1916
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1917
borrow);
1918
break;
1919
}
1920
1921
Temp src00 = bld.tmp(src0.type(), 1);
1922
Temp src01 = bld.tmp(dst.type(), 1);
1923
bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1924
Temp src10 = bld.tmp(src1.type(), 1);
1925
Temp src11 = bld.tmp(dst.type(), 1);
1926
bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1927
if (dst.regClass() == s2) {
1928
Temp borrow = bld.tmp(s1);
1929
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1930
borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1931
bld.scc(borrow))
1932
.def(1)
1933
.getTemp();
1934
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1935
} else if (dst.regClass() == v2) {
1936
Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1937
borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1938
borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1939
Operand::c32(1u), borrow);
1940
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1941
} else {
1942
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1943
}
1944
break;
1945
}
1946
case nir_op_imul: {
1947
if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1948
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1949
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1950
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1951
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1952
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1953
} else if (dst.type() == RegType::vgpr) {
1954
Temp src0 = get_alu_src(ctx, instr->src[0]);
1955
Temp src1 = get_alu_src(ctx, instr->src[1]);
1956
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1957
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1958
1959
if (src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff &&
1960
(ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) {
1961
/* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16
1962
* but only on GFX8-9 because GFX10 doesn't zero the upper 16
1963
* bits.
1964
*/
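/* For example, with src0_ub = src1_ub = 0xff the product is at most
 * 0xff * 0xff = 0xfe01, which still fits in the low 16 bits. */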
1965
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */,
1966
false, false, true /* nuw */);
1967
} else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) {
1968
/* Initialize the accumulator to 0 to allow further combinations
1969
* in the optimizer.
1970
*/
1971
Operand op0(src0);
1972
Operand op1(src1);
1973
bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0),
1974
bld.set16bit(op1), Operand::zero());
1975
} else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1976
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);
1977
} else if (nir_src_is_const(instr->src[0].src)) {
1978
bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1979
nir_src_as_uint(instr->src[0].src), false);
1980
} else if (nir_src_is_const(instr->src[1].src)) {
1981
bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1982
nir_src_as_uint(instr->src[1].src), false);
1983
} else {
1984
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1985
}
1986
} else if (dst.regClass() == s1) {
1987
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1988
} else {
1989
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1990
}
1991
break;
1992
}
1993
case nir_op_umul_high: {
1994
if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1995
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1996
} else if (dst.bytes() == 4) {
1997
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1998
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1999
2000
Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
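/* When s_mul_hi_u32 is not usable (pre-GFX9), an SGPR result is computed in a
 * VGPR and read back with p_as_uniform below. */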
2001
if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2002
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2003
} else {
2004
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2005
}
2006
2007
if (dst.regClass() == s1)
2008
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2009
} else {
2010
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2011
}
2012
break;
2013
}
2014
case nir_op_imul_high: {
2015
if (dst.regClass() == v1) {
2016
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2017
} else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2018
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2019
} else if (dst.regClass() == s1) {
2020
Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2021
as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2022
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2023
} else {
2024
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2025
}
2026
break;
2027
}
2028
case nir_op_fmul: {
2029
if (dst.regClass() == v2b) {
2030
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2031
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2032
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2033
} else if (dst.regClass() == v1) {
2034
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2035
} else if (dst.regClass() == v2) {
2036
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2037
} else {
2038
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2039
}
2040
break;
2041
}
2042
case nir_op_fadd: {
2043
if (dst.regClass() == v2b) {
2044
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2045
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2046
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2047
} else if (dst.regClass() == v1) {
2048
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2049
} else if (dst.regClass() == v2) {
2050
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2051
} else {
2052
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2053
}
2054
break;
2055
}
2056
case nir_op_fsub: {
2057
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2058
Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2059
VOP3P_instruction& sub = add->vop3p();
2060
sub.neg_lo[1] = true;
2061
sub.neg_hi[1] = true;
2062
break;
2063
}
2064
2065
Temp src0 = get_alu_src(ctx, instr->src[0]);
2066
Temp src1 = get_alu_src(ctx, instr->src[1]);
2067
if (dst.regClass() == v2b) {
2068
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2069
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2070
else
2071
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2072
} else if (dst.regClass() == v1) {
2073
if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2074
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2075
else
2076
emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2077
} else if (dst.regClass() == v2) {
2078
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2079
as_vgpr(ctx, src1));
2080
add->vop3().neg[1] = true;
2081
} else {
2082
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2083
}
2084
break;
2085
}
2086
case nir_op_fmax: {
2087
if (dst.regClass() == v2b) {
2088
// TODO: check fp_mode.must_flush_denorms16_64
2089
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2090
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2091
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2092
} else if (dst.regClass() == v1) {
2093
emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2094
ctx->block->fp_mode.must_flush_denorms32);
2095
} else if (dst.regClass() == v2) {
2096
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2097
ctx->block->fp_mode.must_flush_denorms16_64);
2098
} else {
2099
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2100
}
2101
break;
2102
}
2103
case nir_op_fmin: {
2104
if (dst.regClass() == v2b) {
2105
// TODO: check fp_mode.must_flush_denorms16_64
2106
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2107
} else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2108
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2109
} else if (dst.regClass() == v1) {
2110
emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2111
ctx->block->fp_mode.must_flush_denorms32);
2112
} else if (dst.regClass() == v2) {
2113
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2114
ctx->block->fp_mode.must_flush_denorms16_64);
2115
} else {
2116
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2117
}
2118
break;
2119
}
2120
case nir_op_cube_face_coord_amd: {
2121
Temp in = get_alu_src(ctx, instr->src[0], 3);
2122
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2123
emit_extract_vector(ctx, in, 2, v1)};
2124
Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2125
ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
2126
Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2127
Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2128
sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2129
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2130
tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2131
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2132
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2133
break;
2134
}
2135
case nir_op_cube_face_index_amd: {
2136
Temp in = get_alu_src(ctx, instr->src[0], 3);
2137
Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2138
emit_extract_vector(ctx, in, 2, v1)};
2139
bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2140
break;
2141
}
2142
case nir_op_bcsel: {
2143
emit_bcsel(ctx, instr, dst);
2144
break;
2145
}
2146
case nir_op_frsq: {
2147
if (dst.regClass() == v2b) {
2148
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2149
} else if (dst.regClass() == v1) {
2150
Temp src = get_alu_src(ctx, instr->src[0]);
2151
emit_rsq(ctx, bld, Definition(dst), src);
2152
} else if (dst.regClass() == v2) {
2153
/* Lowered at NIR level for precision reasons. */
2154
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2155
} else {
2156
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2157
}
2158
break;
2159
}
2160
case nir_op_fneg: {
2161
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2162
Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2163
bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2164
instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2165
emit_split_vector(ctx, dst, 2);
2166
break;
2167
}
2168
Temp src = get_alu_src(ctx, instr->src[0]);
2169
if (dst.regClass() == v2b) {
2170
bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2171
} else if (dst.regClass() == v1) {
2172
bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2173
as_vgpr(ctx, src));
2174
} else if (dst.regClass() == v2) {
2175
if (ctx->block->fp_mode.must_flush_denorms16_64)
2176
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2177
as_vgpr(ctx, src));
2178
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2179
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2180
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2181
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2182
} else {
2183
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2184
}
2185
break;
2186
}
2187
case nir_op_fabs: {
2188
Temp src = get_alu_src(ctx, instr->src[0]);
2189
if (dst.regClass() == v2b) {
2190
Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2191
Operand::c16(0x3c00), as_vgpr(ctx, src))
2192
.instr;
2193
mul->vop3().abs[1] = true;
2194
} else if (dst.regClass() == v1) {
2195
Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2196
Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2197
.instr;
2198
mul->vop3().abs[1] = true;
2199
} else if (dst.regClass() == v2) {
2200
if (ctx->block->fp_mode.must_flush_denorms16_64)
2201
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2202
as_vgpr(ctx, src));
2203
Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2204
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2205
upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2206
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2207
} else {
2208
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2209
}
2210
break;
2211
}
2212
case nir_op_fsat: {
2213
if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2214
Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2215
Instruction* vop3p =
2216
bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2217
instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2218
vop3p->vop3p().clamp = true;
2219
emit_split_vector(ctx, dst, 2);
2220
break;
2221
}
2222
Temp src = get_alu_src(ctx, instr->src[0]);
2223
if (dst.regClass() == v2b) {
2224
bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2225
src);
2226
} else if (dst.regClass() == v1) {
2227
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2228
Operand::c32(0x3f800000u), src);
2229
/* apparently, it is not necessary to flush denorms if this instruction is used with these
2230
* operands */
2231
// TODO: confirm that this holds under any circumstances
2232
} else if (dst.regClass() == v2) {
2233
Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2234
add->vop3().clamp = true;
2235
} else {
2236
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2237
}
2238
break;
2239
}
2240
case nir_op_flog2: {
2241
if (dst.regClass() == v2b) {
2242
emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2243
} else if (dst.regClass() == v1) {
2244
Temp src = get_alu_src(ctx, instr->src[0]);
2245
emit_log2(ctx, bld, Definition(dst), src);
2246
} else {
2247
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2248
}
2249
break;
2250
}
2251
case nir_op_frcp: {
2252
if (dst.regClass() == v2b) {
2253
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2254
} else if (dst.regClass() == v1) {
2255
Temp src = get_alu_src(ctx, instr->src[0]);
2256
emit_rcp(ctx, bld, Definition(dst), src);
2257
} else if (dst.regClass() == v2) {
2258
/* Lowered at NIR level for precision reasons. */
2259
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2260
} else {
2261
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2262
}
2263
break;
2264
}
2265
case nir_op_fexp2: {
2266
if (dst.regClass() == v2b) {
2267
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2268
} else if (dst.regClass() == v1) {
2269
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2270
} else {
2271
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2272
}
2273
break;
2274
}
2275
case nir_op_fsqrt: {
2276
if (dst.regClass() == v2b) {
2277
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2278
} else if (dst.regClass() == v1) {
2279
Temp src = get_alu_src(ctx, instr->src[0]);
2280
emit_sqrt(ctx, bld, Definition(dst), src);
2281
} else if (dst.regClass() == v2) {
2282
/* Lowered at NIR level for precision reasons. */
2283
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2284
} else {
2285
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2286
}
2287
break;
2288
}
2289
case nir_op_ffract: {
2290
if (dst.regClass() == v2b) {
2291
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2292
} else if (dst.regClass() == v1) {
2293
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2294
} else if (dst.regClass() == v2) {
2295
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2296
} else {
2297
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2298
}
2299
break;
2300
}
2301
case nir_op_ffloor: {
2302
if (dst.regClass() == v2b) {
2303
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2304
} else if (dst.regClass() == v1) {
2305
emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2306
} else if (dst.regClass() == v2) {
2307
Temp src = get_alu_src(ctx, instr->src[0]);
2308
emit_floor_f64(ctx, bld, Definition(dst), src);
2309
} else {
2310
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2311
}
2312
break;
2313
}
2314
case nir_op_fceil: {
2315
if (dst.regClass() == v2b) {
2316
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2317
} else if (dst.regClass() == v1) {
2318
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2319
} else if (dst.regClass() == v2) {
2320
if (ctx->options->chip_class >= GFX7) {
2321
emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2322
} else {
2323
/* GFX6 doesn't support V_CEIL_F64, lower it. */
2324
/* trunc = trunc(src0)
2325
* if (src0 > 0.0 && src0 != trunc)
2326
* trunc += 1.0
2327
*/
2328
Temp src0 = get_alu_src(ctx, instr->src[0]);
2329
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2330
Temp tmp0 =
2331
bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2332
Temp tmp1 =
2333
bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2334
Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2335
tmp0, tmp1);
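/* Only the high dword differs between 0.0 and 1.0, so a single v_cndmask
 * selecting 0x3ff00000 is enough; the low dword of the addend is zero either
 * way. */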
2336
Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2337
bld.copy(bld.def(v1), Operand::zero()),
2338
bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2339
add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2340
bld.copy(bld.def(v1), Operand::zero()), add);
2341
bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2342
}
2343
} else {
2344
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2345
}
2346
break;
2347
}
2348
case nir_op_ftrunc: {
2349
if (dst.regClass() == v2b) {
2350
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2351
} else if (dst.regClass() == v1) {
2352
emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2353
} else if (dst.regClass() == v2) {
2354
Temp src = get_alu_src(ctx, instr->src[0]);
2355
emit_trunc_f64(ctx, bld, Definition(dst), src);
2356
} else {
2357
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2358
}
2359
break;
2360
}
2361
case nir_op_fround_even: {
2362
if (dst.regClass() == v2b) {
2363
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2364
} else if (dst.regClass() == v1) {
2365
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2366
} else if (dst.regClass() == v2) {
2367
if (ctx->options->chip_class >= GFX7) {
2368
emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2369
} else {
2370
/* GFX6 doesn't support V_RNDNE_F64, lower it. */
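/* Round to nearest-even by adding and then subtracting 2**52 with the sign of
 * the source (v_bfi merges the source's sign bit into the 0x43300000 high
 * dword). Sources with a magnitude of 2**52 or more are already integers and
 * are passed through unchanged by the final v_cndmask. */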
2371
Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2372
Temp src0 = get_alu_src(ctx, instr->src[0]);
2373
bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2374
2375
Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2376
bld.copy(bld.def(s1), Operand::c32(-2u)));
2377
Temp bfi =
2378
bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2379
bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2380
Temp tmp =
2381
bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2382
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2383
Instruction* sub =
2384
bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2385
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2386
sub->vop3().neg[1] = true;
2387
tmp = sub->definitions[0].getTemp();
2388
2389
Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2390
Operand::c32(0x432fffffu));
2391
Instruction* vop3 =
2392
bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2393
vop3->vop3().abs[0] = true;
2394
Temp cond = vop3->definitions[0].getTemp();
2395
2396
Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2397
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2398
Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2399
as_vgpr(ctx, src0_lo), cond);
2400
Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2401
as_vgpr(ctx, src0_hi), cond);
2402
2403
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2404
}
2405
} else {
2406
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2407
}
2408
break;
2409
}
2410
case nir_op_fsin:
2411
case nir_op_fcos: {
2412
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2413
aco_ptr<Instruction> norm;
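/* The hardware sin/cos expect the angle pre-scaled by 1/(2*PI): 0x3118 is
 * ~0.1592 in float16 and 0x3e22f983 in float32. */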
2414
if (dst.regClass() == v2b) {
2415
Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
2416
Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2417
aco_opcode opcode =
2418
instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2419
bld.vop1(opcode, Definition(dst), tmp);
2420
} else if (dst.regClass() == v1) {
2421
Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
2422
Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2423
2424
/* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2425
if (ctx->options->chip_class < GFX9)
2426
tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2427
2428
aco_opcode opcode =
2429
instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2430
bld.vop1(opcode, Definition(dst), tmp);
2431
} else {
2432
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2433
}
2434
break;
2435
}
2436
case nir_op_ldexp: {
2437
if (dst.regClass() == v2b) {
2438
emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2439
} else if (dst.regClass() == v1) {
2440
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2441
} else if (dst.regClass() == v2) {
2442
emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2443
} else {
2444
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2445
}
2446
break;
2447
}
2448
case nir_op_frexp_sig: {
2449
if (dst.regClass() == v2b) {
2450
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2451
} else if (dst.regClass() == v1) {
2452
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2453
} else if (dst.regClass() == v2) {
2454
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2455
} else {
2456
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2457
}
2458
break;
2459
}
2460
case nir_op_frexp_exp: {
2461
if (instr->src[0].src.ssa->bit_size == 16) {
2462
Temp src = get_alu_src(ctx, instr->src[0]);
2463
Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2464
tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2465
convert_int(ctx, bld, tmp, 8, 32, true, dst);
2466
} else if (instr->src[0].src.ssa->bit_size == 32) {
2467
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2468
} else if (instr->src[0].src.ssa->bit_size == 64) {
2469
emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2470
} else {
2471
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2472
}
2473
break;
2474
}
2475
case nir_op_fsign: {
2476
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2477
if (dst.regClass() == v2b) {
2478
assert(ctx->program->chip_class >= GFX9);
2479
/* replace negative zero with positive zero */
2480
src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2481
src =
2482
bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2483
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2484
} else if (dst.regClass() == v1) {
2485
src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2486
src =
2487
bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2488
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2489
} else if (dst.regClass() == v2) {
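/* The result's low dword is always zero, so only the high dword is computed:
 * the source's high dword is kept for (signed) zero, otherwise the high dword
 * of +1.0 (0x3FF00000) or -1.0 (0xBFF00000) is selected. */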
2490
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
2491
Operand::zero(), src);
2492
Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2493
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2494
emit_extract_vector(ctx, src, 1, v1), cond);
2495
2496
cond =
2497
bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
2498
tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2499
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2500
2501
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2502
} else {
2503
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2504
}
2505
break;
2506
}
2507
case nir_op_f2f16:
2508
case nir_op_f2f16_rtne: {
2509
Temp src = get_alu_src(ctx, instr->src[0]);
2510
if (instr->src[0].src.ssa->bit_size == 64)
2511
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2512
if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2513
/* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2514
* keep value numbering and the scheduler simpler.
2515
*/
2516
bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2517
else
2518
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2519
break;
2520
}
2521
case nir_op_f2f16_rtz: {
2522
Temp src = get_alu_src(ctx, instr->src[0]);
2523
if (instr->src[0].src.ssa->bit_size == 64)
2524
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
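/* v_cvt_pkrtz_f16_f32 always rounds towards zero regardless of the current
 * rounding mode; if the mode is already round-towards-zero, the ordinary
 * conversion can be used instead. */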
2525
if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2526
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2527
else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
2528
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2529
else
2530
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2531
break;
2532
}
2533
case nir_op_f2f32: {
2534
if (instr->src[0].src.ssa->bit_size == 16) {
2535
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2536
} else if (instr->src[0].src.ssa->bit_size == 64) {
2537
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2538
} else {
2539
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2540
}
2541
break;
2542
}
2543
case nir_op_f2f64: {
2544
Temp src = get_alu_src(ctx, instr->src[0]);
2545
if (instr->src[0].src.ssa->bit_size == 16)
2546
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2547
bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2548
break;
2549
}
2550
case nir_op_i2f16: {
2551
assert(dst.regClass() == v2b);
2552
Temp src = get_alu_src(ctx, instr->src[0]);
2553
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2554
if (input_size <= 16) {
2555
/* Expand the integer to the size expected by the int→float converter used below */
2556
unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2557
if (input_size != target_size) {
2558
src = convert_int(ctx, bld, src, input_size, target_size, true);
2559
}
2560
} else if (input_size == 64) {
2561
/* Truncate down to 32 bits; if any of the upper bits are relevant,
2562
* the value does not fall into the half-precision float range
2563
* anyway. SPIR-V does not mandate any specific behavior for such
2564
* large inputs.
2565
*/
2566
src = convert_int(ctx, bld, src, 64, 32, false);
2567
}
2568
2569
if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
2570
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2571
} else {
2572
/* Convert to f32 and then down to f16. This is needed to handle
2573
* inputs slightly outside the range [INT16_MIN, INT16_MAX],
2574
* which are representable via f16 but wouldn't be converted
2575
* correctly by v_cvt_f16_i16.
2576
*
2577
* This is also the fallback-path taken on GFX7 and earlier, which
2578
* do not support direct f16⟷i16 conversions.
2579
*/
2580
src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2581
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2582
}
2583
break;
2584
}
2585
case nir_op_i2f32: {
2586
assert(dst.size() == 1);
2587
Temp src = get_alu_src(ctx, instr->src[0]);
2588
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2589
if (input_size <= 32) {
2590
if (input_size <= 16) {
2591
/* Sign-extend to 32-bits */
2592
src = convert_int(ctx, bld, src, input_size, 32, true);
2593
}
2594
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2595
} else {
2596
assert(input_size == 64);
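/* Convert via f64: the low half as unsigned, the high half as signed and
 * scaled by 2**32, then sum and convert the result to f32. */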
2597
RegClass rc = RegClass(src.type(), 1);
2598
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2599
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2600
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2601
upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2602
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2603
upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2604
bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2605
}
2606
2607
break;
2608
}
2609
case nir_op_i2f64: {
2610
if (instr->src[0].src.ssa->bit_size <= 32) {
2611
Temp src = get_alu_src(ctx, instr->src[0]);
2612
if (instr->src[0].src.ssa->bit_size <= 16)
2613
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2614
bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2615
} else if (instr->src[0].src.ssa->bit_size == 64) {
2616
Temp src = get_alu_src(ctx, instr->src[0]);
2617
RegClass rc = RegClass(src.type(), 1);
2618
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2619
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2620
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2621
upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2622
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2623
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2624
2625
} else {
2626
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2627
}
2628
break;
2629
}
2630
case nir_op_u2f16: {
2631
assert(dst.regClass() == v2b);
2632
Temp src = get_alu_src(ctx, instr->src[0]);
2633
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2634
if (input_size <= 16) {
2635
/* Expand integer to the size expected by the uint→float converter used below */
2636
unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2637
if (input_size != target_size) {
2638
src = convert_int(ctx, bld, src, input_size, target_size, false);
2639
}
2640
} else if (input_size == 64) {
2641
/* Truncate down to 32 bits; if any of the upper bits are non-zero,
2642
* the value does not fall into the half-precision float range
2643
* anyway. SPIR-V does not mandate any specific behavior for such
2644
* large inputs.
2645
*/
2646
src = convert_int(ctx, bld, src, 64, 32, false);
2647
}
2648
2649
if (ctx->program->chip_class >= GFX8) {
2650
/* Unsigned integers up to 65519 convert to finite float16 values. Converting
2651
* from larger inputs is UB, so we just need to consider the lower 16 bits */
2652
bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2653
} else {
2654
/* GFX7 and earlier do not support direct f16⟷u16 conversions */
2655
src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2656
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2657
}
2658
break;
2659
}
2660
case nir_op_u2f32: {
2661
assert(dst.size() == 1);
2662
Temp src = get_alu_src(ctx, instr->src[0]);
2663
const unsigned input_size = instr->src[0].src.ssa->bit_size;
2664
if (input_size == 8) {
2665
bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2666
} else if (input_size <= 32) {
2667
if (input_size == 16)
2668
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2669
bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2670
} else {
2671
assert(input_size == 64);
2672
RegClass rc = RegClass(src.type(), 1);
2673
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2674
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2675
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2676
upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2677
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2678
upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2679
bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2680
}
2681
break;
2682
}
2683
case nir_op_u2f64: {
2684
if (instr->src[0].src.ssa->bit_size <= 32) {
2685
Temp src = get_alu_src(ctx, instr->src[0]);
2686
if (instr->src[0].src.ssa->bit_size <= 16)
2687
src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2688
bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2689
} else if (instr->src[0].src.ssa->bit_size == 64) {
2690
Temp src = get_alu_src(ctx, instr->src[0]);
2691
RegClass rc = RegClass(src.type(), 1);
2692
Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2693
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2694
lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2695
upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2696
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2697
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2698
} else {
2699
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2700
}
2701
break;
2702
}
2703
case nir_op_f2i8:
2704
case nir_op_f2i16: {
2705
if (instr->src[0].src.ssa->bit_size == 16) {
2706
if (ctx->program->chip_class >= GFX8) {
2707
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2708
} else {
2709
/* GFX7 and earlier do not support direct f16⟷i16 conversions */
2710
Temp tmp = bld.tmp(v1);
2711
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2712
tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
2713
tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2714
(dst.type() == RegType::sgpr) ? Temp() : dst);
2715
if (dst.type() == RegType::sgpr) {
2716
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2717
}
2718
}
2719
} else if (instr->src[0].src.ssa->bit_size == 32) {
2720
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2721
} else {
2722
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2723
}
2724
break;
2725
}
2726
case nir_op_f2u8:
2727
case nir_op_f2u16: {
2728
if (instr->src[0].src.ssa->bit_size == 16) {
2729
if (ctx->program->chip_class >= GFX8) {
2730
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2731
} else {
2732
/* GFX7 and earlier do not support direct f16⟷u16 conversions */
2733
Temp tmp = bld.tmp(v1);
2734
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2735
tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
2736
tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2737
(dst.type() == RegType::sgpr) ? Temp() : dst);
2738
if (dst.type() == RegType::sgpr) {
2739
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2740
}
2741
}
2742
} else if (instr->src[0].src.ssa->bit_size == 32) {
2743
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2744
} else {
2745
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2746
}
2747
break;
2748
}
2749
case nir_op_f2i32: {
2750
Temp src = get_alu_src(ctx, instr->src[0]);
2751
if (instr->src[0].src.ssa->bit_size == 16) {
2752
Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2753
if (dst.type() == RegType::vgpr) {
2754
bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2755
} else {
2756
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2757
bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2758
}
2759
} else if (instr->src[0].src.ssa->bit_size == 32) {
2760
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2761
} else if (instr->src[0].src.ssa->bit_size == 64) {
2762
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2763
} else {
2764
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2765
}
2766
break;
2767
}
2768
case nir_op_f2u32: {
2769
Temp src = get_alu_src(ctx, instr->src[0]);
2770
if (instr->src[0].src.ssa->bit_size == 16) {
2771
Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2772
if (dst.type() == RegType::vgpr) {
2773
bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2774
} else {
2775
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2776
bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2777
}
2778
} else if (instr->src[0].src.ssa->bit_size == 32) {
2779
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2780
} else if (instr->src[0].src.ssa->bit_size == 64) {
2781
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2782
} else {
2783
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2784
}
2785
break;
2786
}
2787
case nir_op_f2i64: {
2788
Temp src = get_alu_src(ctx, instr->src[0]);
2789
if (instr->src[0].src.ssa->bit_size == 16)
2790
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2791
2792
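/* There is no direct f32->i64 conversion, so this path rebuilds the integer manually:
 * clamp the frexp exponent to [0, 64], reassemble the 24-bit mantissa with its implicit
 * leading one, shift it into a 64-bit value, then apply the sign and saturate when the
 * borrow of the 63-exponent subtraction signals an out-of-range exponent. */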
if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2793
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2794
exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
2795
Operand::c32(64u));
2796
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2797
Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
2798
mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2799
mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
2800
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2801
Temp new_exponent = bld.tmp(v1);
2802
Temp borrow =
2803
bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
2804
if (ctx->program->chip_class >= GFX8)
2805
mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2806
else
2807
mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2808
Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
2809
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2810
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2811
lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
2812
Operand::c32(0xffffffffu), borrow);
2813
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2814
lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2815
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2816
Temp new_lower = bld.tmp(v1);
2817
borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2818
Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2819
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2820
2821
} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2822
if (src.type() == RegType::vgpr)
2823
src = bld.as_uniform(src);
2824
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2825
Operand::c32(0x80017u));
2826
exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2827
Operand::c32(126u));
2828
exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2829
exponent);
2830
exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
2831
Operand::c32(64u), exponent);
2832
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2833
Operand::c32(0x7fffffu), src);
2834
Temp sign =
2835
bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
2836
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2837
Operand::c32(0x800000u), mantissa);
2838
mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2839
Operand::c32(7u));
2840
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2841
exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2842
Operand::c32(63u), exponent);
2843
mantissa =
2844
bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2845
Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
2846
Operand::c32(0xffffffffu)); // exp >= 64
2847
Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
2848
mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2849
Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2850
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2851
lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2852
upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2853
Temp borrow = bld.tmp(s1);
2854
lower =
2855
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2856
upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
2857
bld.scc(borrow));
2858
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2859
2860
} else if (instr->src[0].src.ssa->bit_size == 64) {
2861
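/* f64 source: 0x3df00000 is the high dword of the double constant 2^-32 and 0xc1f00000 that
 * of -2^32, so hi = floor(trunc(x) * 2^-32) and lo = fma(hi, -2^32, trunc(x)) yield the upper
 * and lower 32 bits, which v_cvt_{i,u}32_f64 can then convert directly. */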
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2862
Operand::c32(0x3df00000u));
2863
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2864
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2865
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2866
Operand::c32(0xc1f00000u));
2867
Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2868
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2869
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2870
Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2871
if (dst.type() == RegType::sgpr) {
2872
lower = bld.as_uniform(lower);
2873
upper = bld.as_uniform(upper);
2874
}
2875
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2876
2877
} else {
2878
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2879
}
2880
break;
2881
}
2882
case nir_op_f2u64: {
2883
Temp src = get_alu_src(ctx, instr->src[0]);
2884
if (instr->src[0].src.ssa->bit_size == 16)
2885
src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2886
2887
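/* Roughly the unsigned analogue of the f2i64 path above: exponents above 64 saturate the
 * result to ~0ull, and for small exponents (< 24) the mantissa is shifted right instead of
 * left so the value still lands in the low 32 bits. */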
if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2888
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2889
Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
2890
Operand::c32(64u), exponent);
2891
exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
2892
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2893
mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2894
Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
2895
Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2896
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2897
Temp new_exponent = bld.tmp(v1);
2898
Temp cond_small =
2899
bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
2900
if (ctx->program->chip_class >= GFX8)
2901
mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2902
else
2903
mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2904
Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2905
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2906
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2907
upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
2908
cond_small);
2909
lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
2910
exponent_in_range);
2911
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
2912
exponent_in_range);
2913
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2914
2915
} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2916
if (src.type() == RegType::vgpr)
2917
src = bld.as_uniform(src);
2918
Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2919
Operand::c32(0x80017u));
2920
exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2921
Operand::c32(126u));
2922
exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2923
exponent);
2924
Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2925
Operand::c32(0x7fffffu), src);
2926
mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2927
Operand::c32(0x800000u), mantissa);
2928
Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2929
Operand::c32(24u), exponent);
2930
Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2931
exponent_small);
2932
mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2933
Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2934
exponent, Operand::c32(24u));
2935
mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
2936
exponent_large);
2937
Temp cond =
2938
bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
2939
mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
2940
Operand::c32(0xffffffffu), cond);
2941
Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2942
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2943
Temp cond_small =
2944
bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
2945
lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2946
upper =
2947
bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
2948
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2949
2950
} else if (instr->src[0].src.ssa->bit_size == 64) {
2951
Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2952
Operand::c32(0x3df00000u));
2953
Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2954
Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2955
vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2956
Operand::c32(0xc1f00000u));
2957
Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2958
Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2959
Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2960
Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2961
if (dst.type() == RegType::sgpr) {
2962
lower = bld.as_uniform(lower);
2963
upper = bld.as_uniform(upper);
2964
}
2965
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2966
2967
} else {
2968
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2969
}
2970
break;
2971
}
2972
case nir_op_b2f16: {
2973
Temp src = get_alu_src(ctx, instr->src[0]);
2974
assert(src.regClass() == bld.lm);
2975
2976
if (dst.regClass() == s1) {
2977
src = bool_to_scalar_condition(ctx, src);
2978
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
2979
} else if (dst.regClass() == v2b) {
2980
Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
2981
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
2982
} else {
2983
unreachable("Wrong destination register class for nir_op_b2f16.");
2984
}
2985
break;
2986
}
2987
case nir_op_b2f32: {
2988
Temp src = get_alu_src(ctx, instr->src[0]);
2989
assert(src.regClass() == bld.lm);
2990
2991
if (dst.regClass() == s1) {
2992
src = bool_to_scalar_condition(ctx, src);
2993
bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
2994
} else if (dst.regClass() == v1) {
2995
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
2996
Operand::c32(0x3f800000u), src);
2997
} else {
2998
unreachable("Wrong destination register class for nir_op_b2f32.");
2999
}
3000
break;
3001
}
3002
case nir_op_b2f64: {
3003
Temp src = get_alu_src(ctx, instr->src[0]);
3004
assert(src.regClass() == bld.lm);
3005
3006
if (dst.regClass() == s2) {
3007
src = bool_to_scalar_condition(ctx, src);
3008
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3009
Operand::zero(), bld.scc(src));
3010
} else if (dst.regClass() == v2) {
3011
Temp one = bld.copy(bld.def(v2), Operand::c32(0x3FF00000u));
3012
Temp upper =
3013
bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3014
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3015
} else {
3016
unreachable("Wrong destination register class for nir_op_b2f64.");
3017
}
3018
break;
3019
}
3020
case nir_op_i2i8:
3021
case nir_op_i2i16:
3022
case nir_op_i2i32:
3023
case nir_op_i2i64: {
3024
if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3025
/* no need to do the extract in get_alu_src() */
3026
sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3027
? sgpr_extract_sext
3028
: sgpr_extract_undef;
3029
extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3030
} else {
3031
const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3032
const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
3033
convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3034
output_bitsize > input_bitsize, dst);
3035
}
3036
break;
3037
}
3038
case nir_op_u2u8:
3039
case nir_op_u2u16:
3040
case nir_op_u2u32:
3041
case nir_op_u2u64: {
3042
if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3043
/* no need to do the extract in get_alu_src() */
3044
sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3045
? sgpr_extract_zext
3046
: sgpr_extract_undef;
3047
extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3048
} else {
3049
convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3050
instr->dest.dest.ssa.bit_size, false, dst);
3051
}
3052
break;
3053
}
3054
case nir_op_b2b32:
3055
case nir_op_b2i8:
3056
case nir_op_b2i16:
3057
case nir_op_b2i32:
3058
case nir_op_b2i64: {
3059
Temp src = get_alu_src(ctx, instr->src[0]);
3060
assert(src.regClass() == bld.lm);
3061
3062
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
3063
if (tmp.regClass() == s1) {
3064
bool_to_scalar_condition(ctx, src, tmp);
3065
} else if (tmp.type() == RegType::vgpr) {
3066
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
3067
src);
3068
} else {
3069
unreachable("Invalid register class for b2i32");
3070
}
3071
3072
if (tmp != dst)
3073
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
3074
break;
3075
}
3076
case nir_op_b2b1:
3077
case nir_op_i2b1: {
3078
Temp src = get_alu_src(ctx, instr->src[0]);
3079
assert(dst.regClass() == bld.lm);
3080
3081
if (src.type() == RegType::vgpr) {
3082
assert(src.regClass() == v1 || src.regClass() == v2);
3083
assert(dst.regClass() == bld.lm);
3084
bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3085
Definition(dst), Operand::zero(), src)
3086
.def(0)
3087
.setHint(vcc);
3088
} else {
3089
assert(src.regClass() == s1 || src.regClass() == s2);
3090
Temp tmp;
3091
if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
3092
tmp =
3093
bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3094
.def(1)
3095
.getTemp();
3096
} else {
3097
tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3098
bld.scc(bld.def(s1)), Operand::zero(), src);
3099
}
3100
bool_to_vector_condition(ctx, tmp, dst);
3101
}
3102
break;
3103
}
3104
case nir_op_unpack_64_2x32:
3105
case nir_op_unpack_32_2x16:
3106
case nir_op_unpack_64_4x16:
3107
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3108
emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3109
break;
3110
case nir_op_pack_64_2x32_split: {
3111
Temp src0 = get_alu_src(ctx, instr->src[0]);
3112
Temp src1 = get_alu_src(ctx, instr->src[1]);
3113
3114
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3115
break;
3116
}
3117
case nir_op_unpack_64_2x32_split_x:
3118
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3119
get_alu_src(ctx, instr->src[0]));
3120
break;
3121
case nir_op_unpack_64_2x32_split_y:
3122
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3123
get_alu_src(ctx, instr->src[0]));
3124
break;
3125
case nir_op_unpack_32_2x16_split_x:
3126
if (dst.type() == RegType::vgpr) {
3127
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3128
get_alu_src(ctx, instr->src[0]));
3129
} else {
3130
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3131
}
3132
break;
3133
case nir_op_unpack_32_2x16_split_y:
3134
if (dst.type() == RegType::vgpr) {
3135
bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3136
get_alu_src(ctx, instr->src[0]));
3137
} else {
3138
bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3139
get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3140
Operand::zero());
3141
}
3142
break;
3143
case nir_op_pack_32_2x16_split: {
3144
Temp src0 = get_alu_src(ctx, instr->src[0]);
3145
Temp src1 = get_alu_src(ctx, instr->src[1]);
3146
if (dst.regClass() == v1) {
3147
src0 = emit_extract_vector(ctx, src0, 0, v2b);
3148
src1 = emit_extract_vector(ctx, src1, 0, v2b);
3149
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3150
} else {
3151
src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3152
Operand::c32(0xFFFFu));
3153
src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3154
Operand::c32(16u));
3155
bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3156
}
3157
break;
3158
}
3159
case nir_op_pack_half_2x16_split: {
3160
if (dst.regClass() == v1) {
3161
nir_const_value* val = nir_src_as_const_value(instr->src[1].src);
3162
if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) {
3163
/* v_cvt_f16_f32 zeroes the upper 16 bits of the dword on GFX6-GFX9 */
3164
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));
3165
} else if (!ctx->block->fp_mode.care_about_round16_64 ||
3166
ctx->block->fp_mode.round16_64 == fp_round_tz) {
3167
if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
3168
emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3169
else
3170
emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3171
} else {
3172
Temp src0 =
3173
bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
3174
Temp src1 =
3175
bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
3176
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3177
}
3178
} else {
3179
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3180
}
3181
break;
3182
}
3183
case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3184
case nir_op_unpack_half_2x16_split_x: {
3185
Temp src = get_alu_src(ctx, instr->src[0]);
3186
if (src.regClass() == v1)
3187
src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3188
if (dst.regClass() == v1) {
3189
assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3190
(instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3191
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3192
} else {
3193
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194
}
3195
break;
3196
}
3197
case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3198
case nir_op_unpack_half_2x16_split_y: {
3199
Temp src = get_alu_src(ctx, instr->src[0]);
3200
if (src.regClass() == s1)
3201
src =
3202
bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
3203
else
3204
src =
3205
bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3206
if (dst.regClass() == v1) {
3207
assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3208
(instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3209
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3210
} else {
3211
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3212
}
3213
break;
3214
}
3215
case nir_op_sad_u8x4: {
3216
assert(dst.regClass() == v1);
3217
emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3218
break;
3219
}
3220
case nir_op_fquantize2f16: {
3221
Temp src = get_alu_src(ctx, instr->src[0]);
3222
Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
3223
Temp f32, cmp_res;
3224
3225
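/* Quantize by round-tripping through f16: keep the converted value unless the f16 result is
 * a denormal (class mask 0x36F excludes +/-denormal) or, pre-GFX8, unless |x| is non-zero but
 * below the smallest normal half; in those cases the result is flushed to zero
 * (sign-preserving when required). */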
if (ctx->program->chip_class >= GFX8) {
3226
Temp mask = bld.copy(
3227
bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3228
cmp_res =
3229
bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
3230
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3231
} else {
3232
/* 0x38800000 is the smallest normal half-float value (2^-14) expressed as a 32-bit
 * float, so compare |result| against it and flush to 0 if it's smaller (and non-zero).
 */
3235
f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3236
Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3237
Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3238
tmp0->vop3().abs[0] = true;
3239
Temp tmp1 =
3240
bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
3241
cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3242
tmp0->definitions[0].getTemp(), tmp1);
3243
}
3244
3245
if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3246
Temp copysign_0 =
3247
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3248
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3249
} else {
3250
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3251
}
3252
break;
3253
}
3254
case nir_op_bfm: {
3255
Temp bits = get_alu_src(ctx, instr->src[0]);
3256
Temp offset = get_alu_src(ctx, instr->src[1]);
3257
3258
if (dst.regClass() == s1) {
3259
bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3260
} else if (dst.regClass() == v1) {
3261
bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3262
} else {
3263
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3264
}
3265
break;
3266
}
3267
case nir_op_bitfield_select: {
3268
3269
/* dst = (insert & bitmask) | (base & ~bitmask) */
3270
if (dst.regClass() == s1) {
3271
Temp bitmask = get_alu_src(ctx, instr->src[0]);
3272
Temp insert = get_alu_src(ctx, instr->src[1]);
3273
Temp base = get_alu_src(ctx, instr->src[2]);
3274
aco_ptr<Instruction> sop2;
3275
nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3276
nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3277
Operand lhs;
3278
if (const_insert && const_bitmask) {
3279
lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3280
} else {
3281
insert =
3282
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3283
lhs = Operand(insert);
3284
}
3285
3286
Operand rhs;
3287
nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3288
if (const_base && const_bitmask) {
3289
rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3290
} else {
3291
base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3292
rhs = Operand(base);
3293
}
3294
3295
bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3296
3297
} else if (dst.regClass() == v1) {
3298
emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3299
} else {
3300
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3301
}
3302
break;
3303
}
3304
case nir_op_ubfe:
3305
case nir_op_ibfe: {
3306
if (dst.bytes() != 4)
3307
unreachable("Unsupported BFE bit size");
3308
3309
if (dst.type() == RegType::sgpr) {
3310
Temp base = get_alu_src(ctx, instr->src[0]);
3311
3312
nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3313
nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3314
if (const_offset && const_bits) {
3315
uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3316
aco_opcode opcode =
3317
instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3318
bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3319
break;
3320
}
3321
3322
Temp offset = get_alu_src(ctx, instr->src[1]);
3323
Temp bits = get_alu_src(ctx, instr->src[2]);
3324
if (instr->op == nir_op_ubfe) {
3325
Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3326
Temp masked =
3327
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3328
bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3329
} else {
3330
Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3331
: bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3332
bld.def(s1, scc), bits, Operand::c32(16u));
3333
Operand offset_op = const_offset
3334
? Operand::c32(const_offset->u32 & 0x1fu)
3335
: bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3336
offset, Operand::c32(0x1fu));
3337
3338
Temp extract =
3339
bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3340
bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3341
}
3342
3343
} else {
3344
aco_opcode opcode =
3345
instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3346
emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3347
}
3348
break;
3349
}
3350
case nir_op_extract_u8:
3351
case nir_op_extract_i8:
3352
case nir_op_extract_u16:
3353
case nir_op_extract_i16: {
3354
bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3355
unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3356
uint32_t bits = comp == 4 ? 8 : 16;
3357
unsigned index = nir_src_as_uint(instr->src[1].src);
3358
if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3359
assert(index == 0);
3360
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3361
} else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
3362
Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3363
unsigned swizzle = instr->src[0].swizzle[0];
3364
if (vec.size() > 1) {
3365
vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3366
swizzle = swizzle & 1;
3367
}
3368
index += swizzle * instr->dest.dest.ssa.bit_size / bits;
3369
bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3370
Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3371
} else {
3372
Temp src = get_alu_src(ctx, instr->src[0]);
3373
Definition def(dst);
3374
if (dst.bytes() == 8) {
3375
src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3376
index %= comp;
3377
def = bld.def(src.type(), 1);
3378
}
3379
assert(def.bytes() <= 4);
3380
if (def.regClass() == s1) {
3381
bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3382
Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3383
} else {
3384
src = emit_extract_vector(ctx, src, 0, def.regClass());
3385
bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3386
Operand::c32(bits), Operand::c32(is_signed));
3387
}
3388
if (dst.size() == 2)
3389
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3390
Operand::zero());
3391
}
3392
break;
3393
}
3394
case nir_op_insert_u8:
3395
case nir_op_insert_u16: {
3396
unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3397
uint32_t bits = comp == 4 ? 8 : 16;
3398
unsigned index = nir_src_as_uint(instr->src[1].src);
3399
if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3400
assert(index == 0);
3401
bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3402
} else {
3403
Temp src = get_alu_src(ctx, instr->src[0]);
3404
Definition def(dst);
3405
bool swap = false;
3406
if (dst.bytes() == 8) {
3407
src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3408
swap = index >= comp;
3409
index %= comp;
3410
def = bld.def(src.type(), 1);
3411
}
3412
if (def.regClass() == s1) {
3413
bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3414
Operand::c32(index), Operand::c32(bits));
3415
} else {
3416
src = emit_extract_vector(ctx, src, 0, def.regClass());
3417
bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3418
Operand::c32(bits));
3419
}
3420
if (dst.size() == 2 && swap)
3421
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3422
def.getTemp());
3423
else if (dst.size() == 2)
3424
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3425
Operand::zero());
3426
}
3427
break;
3428
}
3429
case nir_op_bit_count: {
3430
Temp src = get_alu_src(ctx, instr->src[0]);
3431
if (src.regClass() == s1) {
3432
bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3433
} else if (src.regClass() == v1) {
3434
bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3435
} else if (src.regClass() == v2) {
3436
bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3437
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3438
emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3439
} else if (src.regClass() == s2) {
3440
bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3441
} else {
3442
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3443
}
3444
break;
3445
}
3446
case nir_op_flt: {
3447
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3448
aco_opcode::v_cmp_lt_f64);
3449
break;
3450
}
3451
case nir_op_fge: {
3452
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3453
aco_opcode::v_cmp_ge_f64);
3454
break;
3455
}
3456
case nir_op_feq: {
3457
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3458
aco_opcode::v_cmp_eq_f64);
3459
break;
3460
}
3461
case nir_op_fneu: {
3462
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3463
aco_opcode::v_cmp_neq_f64);
3464
break;
3465
}
3466
case nir_op_ilt: {
3467
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3468
aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3469
break;
3470
}
3471
case nir_op_ige: {
3472
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3473
aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3474
break;
3475
}
3476
case nir_op_ieq: {
3477
if (instr->src[0].src.ssa->bit_size == 1)
3478
emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3479
else
3480
emit_comparison(
3481
ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3482
aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3483
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3484
break;
3485
}
3486
case nir_op_ine: {
3487
if (instr->src[0].src.ssa->bit_size == 1)
3488
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3489
else
3490
emit_comparison(
3491
ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3492
aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3493
ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3494
break;
3495
}
3496
case nir_op_ult: {
3497
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3498
aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3499
break;
3500
}
3501
case nir_op_uge: {
3502
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3503
aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3504
break;
3505
}
3506
case nir_op_fddx:
3507
case nir_op_fddy:
3508
case nir_op_fddx_fine:
3509
case nir_op_fddy_fine:
3510
case nir_op_fddx_coarse:
3511
case nir_op_fddy_coarse: {
3512
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3513
uint16_t dpp_ctrl1, dpp_ctrl2;
3514
if (instr->op == nir_op_fddx_fine) {
3515
dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3516
dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3517
} else if (instr->op == nir_op_fddy_fine) {
3518
dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3519
dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3520
} else {
3521
dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3522
if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3523
dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3524
else
3525
dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3526
}
3527
3528
Temp tmp;
3529
if (ctx->program->chip_class >= GFX8) {
3530
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3531
tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3532
} else {
3533
Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3534
Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3535
tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3536
}
3537
emit_wqm(bld, tmp, dst, true);
3538
break;
3539
}
3540
default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3541
}
3542
}
3543
3544
void
3545
visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3546
{
3547
Temp dst = get_ssa_temp(ctx, &instr->def);
3548
3549
// TODO: we really want to have the resulting type, as this would allow for 64-bit literals
// (which currently get truncated: to the lsb if double and to the msb if int).
// For now, we only use s_mov_b64 with 64-bit inline constants.
3552
assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3553
assert(dst.type() == RegType::sgpr);
3554
3555
Builder bld(ctx->program, ctx->block);
3556
3557
if (instr->def.bit_size == 1) {
3558
assert(dst.regClass() == bld.lm);
3559
int val = instr->value[0].b ? -1 : 0;
3560
Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3561
bld.copy(Definition(dst), op);
3562
} else if (instr->def.bit_size == 8) {
3563
bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3564
} else if (instr->def.bit_size == 16) {
3565
/* sign-extend to use s_movk_i32 instead of a literal */
3566
bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3567
} else if (dst.size() == 1) {
3568
bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3569
} else {
3570
assert(dst.size() != 1);
3571
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3572
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3573
if (instr->def.bit_size == 64)
3574
for (unsigned i = 0; i < dst.size(); i++)
3575
vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3576
else {
3577
for (unsigned i = 0; i < dst.size(); i++)
3578
vec->operands[i] = Operand::c32(instr->value[i].u32);
3579
}
3580
vec->definitions[0] = Definition(dst);
3581
ctx->block->instructions.emplace_back(std::move(vec));
3582
}
3583
}
3584
3585
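/* Expands each bit of `mask` into `multiplier` consecutive bits,
 * e.g. widen_mask(0b101, 2) == 0b110011. */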
uint32_t
widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
      if (mask & (1u << i))
         new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}
3594
3595
struct LoadEmitInfo {
   Operand offset;
   Temp dst;
   unsigned num_components;
   unsigned component_size;
   Temp resource = Temp(0, s1);
   unsigned component_stride = 0;
   unsigned const_offset = 0;
   unsigned align_mul = 0;
   unsigned align_offset = 0;

   bool glc = false;
   bool slc = false;
   unsigned swizzle_component_size = 0;
   memory_sync_info sync;
   Temp soffset = Temp(0, s1);
};
3612
3613
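/* Per-memory-type parameters for emit_load(): the callback emits a single hardware load,
 * and the flags describe what that kind of load supports (byte-aligned accesses, sub-dword
 * results) plus the exclusive upper bound on the constant offset it accepts. */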
struct EmitLoadParameters {
   using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
                             unsigned bytes_needed, unsigned align, unsigned const_offset,
                             Temp dst_hint);

   Callback callback;
   bool byte_align_loads;
   bool supports_8bit_16bit_loads;
   unsigned max_const_offset_plus_one;
};
3623
3624
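/* Splits a load into chunks the callback can emit (within its alignment and constant-offset
 * limits), then recombines the chunks into info.dst, going through p_as_uniform when an SGPR
 * destination has to be assembled from VGPR parts. */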
void
3625
emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3626
const EmitLoadParameters& params)
3627
{
3628
unsigned load_size = info.num_components * info.component_size;
3629
unsigned component_size = info.component_size;
3630
3631
unsigned num_vals = 0;
3632
Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3633
3634
unsigned const_offset = info.const_offset;
3635
3636
const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3637
unsigned align_offset = (info.align_offset + const_offset) % align_mul;
3638
3639
unsigned bytes_read = 0;
3640
while (bytes_read < load_size) {
3641
unsigned bytes_needed = load_size - bytes_read;
3642
3643
/* add buffer for unaligned loads */
3644
int byte_align = 0;
3645
if (params.byte_align_loads) {
3646
byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3647
}
3648
3649
if (byte_align) {
3650
if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3651
!params.supports_8bit_16bit_loads) {
3652
if (info.component_stride) {
3653
assert(params.supports_8bit_16bit_loads && "unimplemented");
3654
bytes_needed = 2;
3655
byte_align = 0;
3656
} else {
3657
bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3658
bytes_needed = align(bytes_needed, 4);
3659
}
3660
} else {
3661
byte_align = 0;
3662
}
3663
}
3664
3665
if (info.swizzle_component_size)
3666
bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3667
if (info.component_stride)
3668
bytes_needed = MIN2(bytes_needed, info.component_size);
3669
3670
bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3671
3672
/* reduce constant offset */
3673
Operand offset = info.offset;
3674
unsigned reduced_const_offset = const_offset;
3675
bool remove_const_offset_completely = need_to_align_offset;
3676
if (const_offset &&
3677
(remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
3678
unsigned to_add = const_offset;
3679
if (remove_const_offset_completely) {
3680
reduced_const_offset = 0;
3681
} else {
3682
to_add =
3683
const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
3684
reduced_const_offset %= params.max_const_offset_plus_one;
3685
}
3686
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3687
if (offset.isConstant()) {
3688
offset = Operand::c32(offset.constantValue() + to_add);
3689
} else if (offset_tmp.regClass() == s1) {
3690
offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
3691
Operand::c32(to_add));
3692
} else if (offset_tmp.regClass() == v1) {
3693
offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
3694
} else {
3695
Temp lo = bld.tmp(offset_tmp.type(), 1);
3696
Temp hi = bld.tmp(offset_tmp.type(), 1);
3697
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3698
3699
if (offset_tmp.regClass() == s2) {
3700
Temp carry = bld.tmp(s1);
3701
lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
3702
Operand::c32(to_add));
3703
hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3704
offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3705
} else {
3706
Temp new_lo = bld.tmp(v1);
3707
Temp carry =
3708
bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
3709
hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
3710
offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3711
}
3712
}
3713
}
3714
3715
/* align offset down if needed */
3716
Operand aligned_offset = offset;
3717
unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3718
if (need_to_align_offset) {
3719
align = 4;
3720
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3721
if (offset.isConstant()) {
3722
aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
3723
} else if (offset_tmp.regClass() == s1) {
3724
aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3725
Operand::c32(0xfffffffcu), offset_tmp);
3726
} else if (offset_tmp.regClass() == s2) {
3727
aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
3728
Operand::c64(0xfffffffffffffffcllu), offset_tmp);
3729
} else if (offset_tmp.regClass() == v1) {
3730
aligned_offset =
3731
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
3732
} else if (offset_tmp.regClass() == v2) {
3733
Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3734
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3735
lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
3736
aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3737
}
3738
}
3739
Temp aligned_offset_tmp =
3740
aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
3741
3742
Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3743
reduced_const_offset, byte_align ? Temp() : info.dst);
3744
3745
/* the callback wrote directly to dst */
3746
if (val == info.dst) {
3747
assert(num_vals == 0);
3748
emit_split_vector(ctx, info.dst, info.num_components);
3749
return;
3750
}
3751
3752
/* shift result right if needed */
3753
if (params.byte_align_loads && info.component_size < 4) {
3754
Operand byte_align_off = Operand::c32(byte_align);
3755
if (byte_align == -1) {
3756
if (offset.isConstant())
3757
byte_align_off = Operand::c32(offset.constantValue() % 4u);
3758
else if (offset.size() == 2)
3759
byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
3760
RegClass(offset.getTemp().type(), 1)));
3761
else
3762
byte_align_off = offset;
3763
}
3764
3765
assert(val.bytes() >= load_size && "unimplemented");
3766
if (val.type() == RegType::sgpr)
3767
byte_align_scalar(ctx, val, byte_align_off, info.dst);
3768
else
3769
byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
3770
return;
3771
}
3772
3773
/* add result to list and advance */
3774
if (info.component_stride) {
3775
assert(val.bytes() == info.component_size && "unimplemented");
3776
const_offset += info.component_stride;
3777
align_offset = (align_offset + info.component_stride) % align_mul;
3778
} else {
3779
const_offset += val.bytes();
3780
align_offset = (align_offset + val.bytes()) % align_mul;
3781
}
3782
bytes_read += val.bytes();
3783
vals[num_vals++] = val;
3784
}
3785
3786
/* create array of components */
3787
unsigned components_split = 0;
3788
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3789
bool has_vgprs = false;
3790
for (unsigned i = 0; i < num_vals;) {
3791
Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
3792
unsigned num_tmps = 0;
3793
unsigned tmp_size = 0;
3794
RegType reg_type = RegType::sgpr;
3795
while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3796
if (vals[i].type() == RegType::vgpr)
3797
reg_type = RegType::vgpr;
3798
tmp_size += vals[i].bytes();
3799
tmp[num_tmps++] = vals[i++];
3800
}
3801
if (num_tmps > 1) {
3802
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3803
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3804
for (unsigned j = 0; j < num_tmps; j++)
3805
vec->operands[j] = Operand(tmp[j]);
3806
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3807
vec->definitions[0] = Definition(tmp[0]);
3808
bld.insert(std::move(vec));
3809
}
3810
3811
if (tmp[0].bytes() % component_size) {
3812
/* trim tmp[0] */
3813
assert(i == num_vals);
3814
RegClass new_rc =
3815
RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3816
tmp[0] =
3817
bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
3818
}
3819
3820
RegClass elem_rc = RegClass::get(reg_type, component_size);
3821
3822
unsigned start = components_split;
3823
3824
if (tmp_size == elem_rc.bytes()) {
3825
allocated_vec[components_split++] = tmp[0];
3826
} else {
3827
assert(tmp_size % elem_rc.bytes() == 0);
3828
aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3829
aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3830
for (auto& def : split->definitions) {
3831
Temp component = bld.tmp(elem_rc);
3832
allocated_vec[components_split++] = component;
3833
def = Definition(component);
3834
}
3835
split->operands[0] = Operand(tmp[0]);
3836
bld.insert(std::move(split));
3837
}
3838
3839
/* try to p_as_uniform early so we can create more optimizable code and
3840
* also update allocated_vec */
3841
for (unsigned j = start; j < components_split; j++) {
3842
if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
3843
allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3844
has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3845
}
3846
}
3847
3848
/* concatenate components and p_as_uniform() result if needed */
3849
if (info.dst.type() == RegType::vgpr || !has_vgprs)
3850
ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
3851
3852
int padding_bytes =
3853
MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
3854
3855
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3856
aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
3857
for (unsigned i = 0; i < info.num_components; i++)
3858
vec->operands[i] = Operand(allocated_vec[i]);
3859
if (padding_bytes)
3860
vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3861
if (info.dst.type() == RegType::sgpr && has_vgprs) {
3862
Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
3863
vec->definitions[0] = Definition(tmp);
3864
bld.insert(std::move(vec));
3865
bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
3866
} else {
3867
vec->definitions[0] = Definition(info.dst);
3868
bld.insert(std::move(vec));
3869
}
3870
}
3871
3872
Operand
load_lds_size_m0(Builder& bld)
{
   /* TODO: m0 does not need to be initialized on GFX9+ */
   return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
}
3878
3879
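/* Chooses the widest ds_read the requested size, alignment and chip allow (b96/b128 need
 * GFX7+, the read2 forms need a suitably aligned constant offset), and moves any excess
 * constant offset into the address VGPR since the DS offset fields are limited. */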
Temp
3880
lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3881
unsigned align, unsigned const_offset, Temp dst_hint)
3882
{
3883
offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3884
3885
Operand m = load_lds_size_m0(bld);
3886
3887
bool large_ds_read = bld.program->chip_class >= GFX7;
3888
bool usable_read2 = bld.program->chip_class >= GFX7;
3889
3890
bool read2 = false;
3891
unsigned size = 0;
3892
aco_opcode op;
3893
if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3894
size = 16;
3895
op = aco_opcode::ds_read_b128;
3896
} else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3897
size = 16;
3898
read2 = true;
3899
op = aco_opcode::ds_read2_b64;
3900
} else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3901
size = 12;
3902
op = aco_opcode::ds_read_b96;
3903
} else if (bytes_needed >= 8 && align % 8 == 0) {
3904
size = 8;
3905
op = aco_opcode::ds_read_b64;
3906
} else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
3907
size = 8;
3908
read2 = true;
3909
op = aco_opcode::ds_read2_b32;
3910
} else if (bytes_needed >= 4 && align % 4 == 0) {
3911
size = 4;
3912
op = aco_opcode::ds_read_b32;
3913
} else if (bytes_needed >= 2 && align % 2 == 0) {
3914
size = 2;
3915
op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
3916
} else {
3917
size = 1;
3918
op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
3919
}
3920
3921
unsigned const_offset_unit = read2 ? size / 2u : 1u;
3922
unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
3923
3924
if (const_offset > (const_offset_range - const_offset_unit)) {
3925
unsigned excess = const_offset - (const_offset % const_offset_range);
3926
offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
3927
const_offset -= excess;
3928
}
3929
3930
const_offset /= const_offset_unit;
3931
3932
RegClass rc = RegClass::get(RegType::vgpr, size);
3933
Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3934
Instruction* instr;
3935
if (read2)
3936
instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3937
else
3938
instr = bld.ds(op, Definition(val), offset, m, const_offset);
3939
instr->ds().sync = info.sync;
3940
3941
return val;
3942
}
3943
3944
const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
3945
3946
Temp
3947
smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3948
unsigned align, unsigned const_offset, Temp dst_hint)
3949
{
3950
unsigned size = 0;
3951
aco_opcode op;
3952
if (bytes_needed <= 4) {
3953
size = 1;
3954
op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3955
} else if (bytes_needed <= 8) {
3956
size = 2;
3957
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3958
} else if (bytes_needed <= 16) {
3959
size = 4;
3960
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3961
} else if (bytes_needed <= 32) {
3962
size = 8;
3963
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3964
} else {
3965
size = 16;
3966
op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3967
}
3968
aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3969
if (info.resource.id()) {
3970
load->operands[0] = Operand(info.resource);
3971
load->operands[1] = Operand(offset);
3972
} else {
3973
load->operands[0] = Operand(offset);
3974
load->operands[1] = Operand::zero();
3975
}
3976
RegClass rc(RegType::sgpr, size);
3977
Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3978
load->definitions[0] = Definition(val);
3979
load->glc = info.glc;
3980
load->dlc = info.glc && bld.program->chip_class >= GFX10;
3981
load->sync = info.sync;
3982
bld.insert(std::move(load));
3983
return val;
3984
}
3985
3986
const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
3987
3988
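/* Buffer loads may start at any byte, but the dword-sized forms assume natural alignment,
 * so insufficiently aligned requests fall back to buffer_load_ubyte/ushort. */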
Temp
3989
mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3990
unsigned align_, unsigned const_offset, Temp dst_hint)
3991
{
3992
Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3993
Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
3994
3995
if (info.soffset.id()) {
3996
if (soffset.isTemp())
3997
vaddr = bld.copy(bld.def(v1), soffset);
3998
soffset = Operand(info.soffset);
3999
}
4000
4001
unsigned bytes_size = 0;
4002
aco_opcode op;
4003
if (bytes_needed == 1 || align_ % 2) {
4004
bytes_size = 1;
4005
op = aco_opcode::buffer_load_ubyte;
4006
} else if (bytes_needed == 2 || align_ % 4) {
4007
bytes_size = 2;
4008
op = aco_opcode::buffer_load_ushort;
4009
} else if (bytes_needed <= 4) {
4010
bytes_size = 4;
4011
op = aco_opcode::buffer_load_dword;
4012
} else if (bytes_needed <= 8) {
4013
bytes_size = 8;
4014
op = aco_opcode::buffer_load_dwordx2;
4015
} else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
4016
bytes_size = 12;
4017
op = aco_opcode::buffer_load_dwordx3;
4018
} else {
4019
bytes_size = 16;
4020
op = aco_opcode::buffer_load_dwordx4;
4021
}
4022
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4023
mubuf->operands[0] = Operand(info.resource);
4024
mubuf->operands[1] = vaddr;
4025
mubuf->operands[2] = soffset;
4026
mubuf->offen = (offset.type() == RegType::vgpr);
4027
mubuf->glc = info.glc;
4028
mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
4029
mubuf->slc = info.slc;
4030
mubuf->sync = info.sync;
4031
mubuf->offset = const_offset;
4032
mubuf->swizzled = info.swizzle_component_size != 0;
4033
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4034
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4035
mubuf->definitions[0] = Definition(val);
4036
bld.insert(std::move(mubuf));
4037
4038
return val;
4039
}
4040
4041
const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4042
const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
4043
4044
Temp
get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

   if (addr.type() == RegType::vgpr)
      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
                        Operand::c32(-1u), Operand::c32(rsrc_conf));
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}
4056
4057
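/* GFX6 has no FLAT addressing, so global access is emulated with an addr64 MUBUF using the
 * resource from get_gfx6_global_rsrc(); GFX7-8 use FLAT and GFX9+ the GLOBAL encoding. */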
Temp
4058
global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4059
unsigned align_, unsigned const_offset, Temp dst_hint)
4060
{
4061
unsigned bytes_size = 0;
4062
bool use_mubuf = bld.program->chip_class == GFX6;
4063
bool global = bld.program->chip_class >= GFX9;
4064
aco_opcode op;
4065
if (bytes_needed == 1) {
4066
bytes_size = 1;
4067
op = use_mubuf ? aco_opcode::buffer_load_ubyte
4068
: global ? aco_opcode::global_load_ubyte
4069
: aco_opcode::flat_load_ubyte;
4070
} else if (bytes_needed == 2) {
4071
bytes_size = 2;
4072
op = use_mubuf ? aco_opcode::buffer_load_ushort
4073
: global ? aco_opcode::global_load_ushort
4074
: aco_opcode::flat_load_ushort;
4075
} else if (bytes_needed <= 4) {
4076
bytes_size = 4;
4077
op = use_mubuf ? aco_opcode::buffer_load_dword
4078
: global ? aco_opcode::global_load_dword
4079
: aco_opcode::flat_load_dword;
4080
} else if (bytes_needed <= 8) {
4081
bytes_size = 8;
4082
op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4083
: global ? aco_opcode::global_load_dwordx2
4084
: aco_opcode::flat_load_dwordx2;
4085
} else if (bytes_needed <= 12 && !use_mubuf) {
4086
bytes_size = 12;
4087
op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4088
} else {
4089
bytes_size = 16;
4090
op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4091
: global ? aco_opcode::global_load_dwordx4
4092
: aco_opcode::flat_load_dwordx4;
4093
}
4094
RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
4095
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4096
if (use_mubuf) {
4097
aco_ptr<MUBUF_instruction> mubuf{
4098
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4099
mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
4100
mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4101
mubuf->operands[2] = Operand::zero();
4102
mubuf->glc = info.glc;
4103
mubuf->dlc = false;
4104
mubuf->offset = 0;
4105
mubuf->addr64 = offset.type() == RegType::vgpr;
4106
mubuf->disable_wqm = false;
4107
mubuf->sync = info.sync;
4108
mubuf->definitions[0] = Definition(val);
4109
bld.insert(std::move(mubuf));
4110
} else {
4111
offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
4112
4113
aco_ptr<FLAT_instruction> flat{
4114
create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4115
flat->operands[0] = Operand(offset);
4116
flat->operands[1] = Operand(s1);
4117
flat->glc = info.glc;
4118
flat->dlc = info.glc && bld.program->chip_class >= GFX10;
4119
flat->sync = info.sync;
4120
flat->offset = 0u;
4121
flat->definitions[0] = Definition(val);
4122
bld.insert(std::move(flat));
4123
}
4124
4125
return val;
4126
}
4127
4128
const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};
4129
4130
Temp
4131
load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4132
Temp address, unsigned base_offset, unsigned align)
4133
{
4134
assert(util_is_power_of_two_nonzero(align));
4135
4136
Builder bld(ctx->program, ctx->block);
4137
4138
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4139
info.align_mul = align;
4140
info.align_offset = 0;
4141
info.sync = memory_sync_info(storage_shared);
4142
info.const_offset = base_offset;
4143
emit_load(ctx, bld, info, lds_load_params);
4144
4145
return dst;
4146
}
4147
4148
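/* Split src into count temporaries of the given sizes (dst[i] receives bytes[i]
 * bytes). Re-uses the elements recorded in ctx->allocated_vec when they line up,
 * otherwise the source is split with p_split_vector and recombined as needed.
 */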
void
4149
split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4150
Temp src)
4151
{
4152
if (!count)
4153
return;
4154
4155
Builder bld(ctx->program, ctx->block);
4156
4157
/* count == 1 fast path */
4158
if (count == 1) {
4159
if (dst_type == RegType::sgpr)
4160
dst[0] = bld.as_uniform(src);
4161
else
4162
dst[0] = as_vgpr(ctx, src);
4163
return;
4164
}
4165
4166
/* elem_size_bytes is the greatest power of two that divides all the sizes (at most 8) */
4167
unsigned elem_size_bytes =
4168
1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4169
4170
ASSERTED bool is_subdword = elem_size_bytes < 4;
4171
assert(!is_subdword || dst_type == RegType::vgpr);
4172
4173
for (unsigned i = 0; i < count; i++)
4174
dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4175
4176
std::vector<Temp> temps;
4177
/* use allocated_vec if possible */
4178
auto it = ctx->allocated_vec.find(src.id());
4179
if (it != ctx->allocated_vec.end()) {
4180
if (!it->second[0].id())
4181
goto split;
4182
unsigned elem_size = it->second[0].bytes();
4183
assert(src.bytes() % elem_size == 0);
4184
4185
for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4186
if (!it->second[i].id())
4187
goto split;
4188
}
4189
if (elem_size_bytes % elem_size)
4190
goto split;
4191
4192
temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4193
elem_size_bytes = elem_size;
4194
}
4195
4196
split:
4197
/* split src if necessary */
4198
if (temps.empty()) {
4199
if (is_subdword && src.type() == RegType::sgpr)
4200
src = as_vgpr(ctx, src);
4201
if (dst_type == RegType::sgpr)
4202
src = bld.as_uniform(src);
4203
4204
unsigned num_elems = src.bytes() / elem_size_bytes;
4205
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4206
aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4207
split->operands[0] = Operand(src);
4208
for (unsigned i = 0; i < num_elems; i++) {
4209
temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4210
split->definitions[i] = Definition(temps.back());
4211
}
4212
bld.insert(std::move(split));
4213
}
4214
4215
unsigned idx = 0;
4216
for (unsigned i = 0; i < count; i++) {
4217
unsigned op_count = dst[i].bytes() / elem_size_bytes;
4218
if (op_count == 1) {
4219
if (dst_type == RegType::sgpr)
4220
dst[i] = bld.as_uniform(temps[idx++]);
4221
else
4222
dst[i] = as_vgpr(ctx, temps[idx++]);
4223
continue;
4224
}
4225
4226
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4227
Format::PSEUDO, op_count, 1)};
4228
for (unsigned j = 0; j < op_count; j++) {
4229
Temp tmp = temps[idx++];
4230
if (dst_type == RegType::sgpr)
4231
tmp = bld.as_uniform(tmp);
4232
vec->operands[j] = Operand(tmp);
4233
}
4234
vec->definitions[0] = Definition(dst[i]);
4235
bld.insert(std::move(vec));
4236
}
4237
return;
4238
}
4239
4240
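/* Find the next consecutive byte range of the write mask, starting at the lowest
 * bit still set in todo_mask. Returns false if that range is a gap (bytes that
 * are skipped rather than written). E.g. with mask=0b1100 and todo_mask=0b1111
 * this first yields the skipped range start=0/count=2, then the written range
 * start=2/count=2.
 */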
bool
4241
scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4242
{
4243
unsigned start_elem = ffs(todo_mask) - 1;
4244
bool skip = !(mask & (1 << start_elem));
4245
if (skip)
4246
mask = ~mask & todo_mask;
4247
4248
mask &= todo_mask;
4249
4250
u_bit_scan_consecutive_range(&mask, start, count);
4251
4252
return !skip;
4253
}
4254
4255
void
4256
advance_write_mask(uint32_t* todo_mask, int start, int count)
4257
{
4258
*todo_mask &= ~u_bit_consecutive(0, count) << start;
4259
}
4260
4261
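/* Store data to LDS at address + base_offset according to wrmask. The data is
 * split into ds_write_b8/b16/b32/b64 (and b96/b128 on GFX7+) operations based on
 * size and alignment, and matching b32/b64 writes are combined into ds_write2
 * where possible.
 */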
void
4262
store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4263
unsigned base_offset, unsigned align)
4264
{
4265
assert(util_is_power_of_two_nonzero(align));
4266
assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4267
4268
Builder bld(ctx->program, ctx->block);
4269
bool large_ds_write = ctx->options->chip_class >= GFX7;
4270
bool usable_write2 = ctx->options->chip_class >= GFX7;
4271
4272
unsigned write_count = 0;
4273
Temp write_datas[32];
4274
unsigned offsets[32];
4275
unsigned bytes[32];
4276
aco_opcode opcodes[32];
4277
4278
wrmask = widen_mask(wrmask, elem_size_bytes);
4279
4280
uint32_t todo = u_bit_consecutive(0, data.bytes());
4281
while (todo) {
4282
int offset, byte;
4283
if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4284
offsets[write_count] = offset;
4285
bytes[write_count] = byte;
4286
opcodes[write_count] = aco_opcode::num_opcodes;
4287
write_count++;
4288
advance_write_mask(&todo, offset, byte);
4289
continue;
4290
}
4291
4292
bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4293
bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4294
bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4295
bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4296
4297
// TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4298
aco_opcode op = aco_opcode::num_opcodes;
4299
if (byte >= 16 && aligned16 && large_ds_write) {
4300
op = aco_opcode::ds_write_b128;
4301
byte = 16;
4302
} else if (byte >= 12 && aligned16 && large_ds_write) {
4303
op = aco_opcode::ds_write_b96;
4304
byte = 12;
4305
} else if (byte >= 8 && aligned8) {
4306
op = aco_opcode::ds_write_b64;
4307
byte = 8;
4308
} else if (byte >= 4 && aligned4) {
4309
op = aco_opcode::ds_write_b32;
4310
byte = 4;
4311
} else if (byte >= 2 && aligned2) {
4312
op = aco_opcode::ds_write_b16;
4313
byte = 2;
4314
} else if (byte >= 1) {
4315
op = aco_opcode::ds_write_b8;
4316
byte = 1;
4317
} else {
4318
assert(false);
4319
}
4320
4321
offsets[write_count] = offset;
4322
bytes[write_count] = byte;
4323
opcodes[write_count] = op;
4324
write_count++;
4325
advance_write_mask(&todo, offset, byte);
4326
}
4327
4328
Operand m = load_lds_size_m0(bld);
4329
4330
split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4331
4332
for (unsigned i = 0; i < write_count; i++) {
4333
aco_opcode op = opcodes[i];
4334
if (op == aco_opcode::num_opcodes)
4335
continue;
4336
4337
Temp split_data = write_datas[i];
4338
4339
unsigned second = write_count;
4340
if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4341
for (second = i + 1; second < write_count; second++) {
4342
if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4343
op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4344
opcodes[second] = aco_opcode::num_opcodes;
4345
break;
4346
}
4347
}
4348
}
4349
4350
bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4351
unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4352
4353
unsigned inline_offset = base_offset + offsets[i];
4354
unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4355
Temp address_offset = address;
4356
if (inline_offset > max_offset) {
4357
address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4358
inline_offset = offsets[i];
4359
}
4360
4361
/* offsets[i] shouldn't be large enough for this to happen */
4362
assert(inline_offset <= max_offset);
4363
4364
Instruction* instr;
4365
if (write2) {
4366
Temp second_data = write_datas[second];
4367
inline_offset /= split_data.bytes();
4368
instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4369
inline_offset + write2_off);
4370
} else {
4371
instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4372
}
4373
instr->ds().sync = memory_sync_info(storage_shared);
4374
}
4375
}
4376
4377
aco_opcode
4378
get_buffer_store_op(unsigned bytes)
4379
{
4380
switch (bytes) {
4381
case 1: return aco_opcode::buffer_store_byte;
4382
case 2: return aco_opcode::buffer_store_short;
4383
case 4: return aco_opcode::buffer_store_dword;
4384
case 8: return aco_opcode::buffer_store_dwordx2;
4385
case 12: return aco_opcode::buffer_store_dwordx3;
4386
case 16: return aco_opcode::buffer_store_dwordx4;
4387
}
4388
unreachable("Unexpected store size");
4389
return aco_opcode::num_opcodes;
4390
}
4391
4392
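/* Determine how to split a buffer store according to the write mask: each part is
 * limited to swizzle_element_size, to the store sizes the hardware supports (no
 * 12-byte stores with SMEM or on GFX6) and to what the known alignment allows,
 * then the data is split accordingly and skipped ranges are dropped.
 */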
void
4393
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4394
Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4395
Temp* write_datas, unsigned* offsets)
4396
{
4397
unsigned write_count_with_skips = 0;
4398
bool skips[16];
4399
unsigned bytes[16];
4400
4401
/* determine how to split the data */
4402
unsigned todo = u_bit_consecutive(0, data.bytes());
4403
while (todo) {
4404
int offset, byte;
4405
skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4406
offsets[write_count_with_skips] = offset;
4407
if (skips[write_count_with_skips]) {
4408
bytes[write_count_with_skips] = byte;
4409
advance_write_mask(&todo, offset, byte);
4410
write_count_with_skips++;
4411
continue;
4412
}
4413
4414
/* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, which can't be
4415
* larger than swizzle_element_size */
4416
byte = MIN2(byte, swizzle_element_size);
4417
if (byte % 4)
4418
byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
4419
4420
/* SMEM and GFX6 VMEM can't emit 12-byte stores */
4421
if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
4422
byte = 8;
4423
4424
/* dword or larger stores have to be dword-aligned */
4425
unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
4426
unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
4427
bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
4428
if (!dword_aligned)
4429
byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
4430
4431
bytes[write_count_with_skips] = byte;
4432
advance_write_mask(&todo, offset, byte);
4433
write_count_with_skips++;
4434
}
4435
4436
/* actually split data */
4437
split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
4438
4439
/* remove skips */
4440
for (unsigned i = 0; i < write_count_with_skips; i++) {
4441
if (skips[i])
4442
continue;
4443
write_datas[*write_count] = write_datas[i];
4444
offsets[*write_count] = offsets[i];
4445
(*write_count)++;
4446
}
4447
}
4448
4449
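/* Create a vector from cnt elements of elem_size_bytes each; unset elements are
 * replaced with zero. If split_cnt is set, the result is split again into that
 * many components, otherwise the elements are recorded in ctx->allocated_vec.
 */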
Temp
4450
create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
4451
unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
4452
{
4453
Builder bld(ctx->program, ctx->block);
4454
unsigned dword_size = elem_size_bytes / 4;
4455
4456
if (!dst.id())
4457
dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
4458
4459
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4460
aco_ptr<Pseudo_instruction> instr{
4461
create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
4462
instr->definitions[0] = Definition(dst);
4463
4464
for (unsigned i = 0; i < cnt; ++i) {
4465
if (arr[i].id()) {
4466
assert(arr[i].size() == dword_size);
4467
allocated_vec[i] = arr[i];
4468
instr->operands[i] = Operand(arr[i]);
4469
} else {
4470
Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
4471
Operand::zero(dword_size == 2 ? 8 : 4));
4472
allocated_vec[i] = zero;
4473
instr->operands[i] = Operand(zero);
4474
}
4475
}
4476
4477
bld.insert(std::move(instr));
4478
4479
if (split_cnt)
4480
emit_split_vector(ctx, dst, split_cnt);
4481
else
4482
ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
4483
4484
return dst;
4485
}
4486
4487
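/* MUBUF immediate offsets have to stay below 4096, so fold any excess multiple of
 * 4096 into voffset and return the remaining constant offset. E.g. a const_offset
 * of 5000 adds 4096 to voffset and returns 904.
 */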
inline unsigned
4488
resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
4489
{
4490
if (const_offset >= 4096) {
4491
unsigned excess_const_offset = const_offset / 4096u * 4096u;
4492
const_offset %= 4096u;
4493
4494
if (!voffset.id())
4495
voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
4496
else if (unlikely(voffset.regClass() == s1))
4497
voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
4498
Operand::c32(excess_const_offset), Operand(voffset));
4499
else if (likely(voffset.regClass() == v1))
4500
voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
4501
else
4502
unreachable("Unsupported register class of voffset");
4503
}
4504
4505
return const_offset;
4506
}
4507
4508
void
4509
emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
4510
unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
4511
bool slc = false, bool swizzled = false)
4512
{
4513
assert(vdata.id());
4514
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4515
assert(vdata.size() >= 1 && vdata.size() <= 4);
4516
4517
Builder bld(ctx->program, ctx->block);
4518
aco_opcode op = get_buffer_store_op(vdata.bytes());
4519
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4520
4521
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4522
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
4523
Builder::Result r =
4524
bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4525
/* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4526
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4527
/* dlc*/ false, /* slc */ slc);
4528
4529
r.instr->mubuf().sync = sync;
4530
}
4531
4532
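/* Store src to a buffer with MUBUF: the data is split according to write_mask
 * (into parts of up to 16 bytes when combining is allowed, 4 bytes otherwise) and
 * one store is emitted per part.
 */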
void
4533
store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4534
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4535
bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
4536
bool slc = false)
4537
{
4538
Builder bld(ctx->program, ctx->block);
4539
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4540
assert(write_mask);
4541
write_mask = widen_mask(write_mask, elem_size_bytes);
4542
4543
unsigned write_count = 0;
4544
Temp write_datas[32];
4545
unsigned offsets[32];
4546
split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
4547
&write_count, write_datas, offsets);
4548
4549
for (unsigned i = 0; i < write_count; i++) {
4550
unsigned const_offset = offsets[i] + base_const_offset;
4551
emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
4552
slc, !allow_combining);
4553
}
4554
}
4555
4556
void
4557
load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4558
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4559
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
4560
bool slc = false)
4561
{
4562
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4563
assert((num_components * elem_size_bytes) == dst.bytes());
4564
assert(!!stride != allow_combining);
4565
4566
Builder bld(ctx->program, ctx->block);
4567
4568
LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4569
info.component_stride = allow_combining ? 0 : stride;
4570
info.glc = true;
4571
info.slc = slc;
4572
info.swizzle_component_size = allow_combining ? 0 : 4;
4573
info.align_mul = MIN2(elem_size_bytes, 4);
4574
info.align_offset = 0;
4575
info.soffset = soffset;
4576
info.const_offset = base_const_offset;
4577
emit_load(ctx, bld, info, mubuf_load_params);
4578
}
4579
4580
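/* The wave id within the threadgroup is stored in bits [24:27] of
 * merged_wave_info, extracted here with s_bfe_u32.
 */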
Temp
4581
wave_id_in_threadgroup(isel_context* ctx)
4582
{
4583
Builder bld(ctx->program, ctx->block);
4584
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4585
get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4586
}
4587
4588
Temp
4589
thread_id_in_threadgroup(isel_context* ctx)
4590
{
4591
/* tid_in_tg = wave_id * wave_size + tid_in_wave */
4592
4593
Builder bld(ctx->program, ctx->block);
4594
Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4595
4596
if (ctx->program->workgroup_size <= ctx->program->wave_size)
4597
return tid_in_wave;
4598
4599
Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4600
Temp num_pre_threads =
4601
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4602
Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4603
return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4604
}
4605
4606
Temp
4607
get_tess_rel_patch_id(isel_context* ctx)
4608
{
4609
Builder bld(ctx->program, ctx->block);
4610
4611
switch (ctx->shader->info.stage) {
4612
case MESA_SHADER_TESS_CTRL:
4613
return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4614
Operand::zero(), Operand::c32(8u), Operand::zero());
4615
case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4616
default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4617
}
4618
}
4619
4620
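/* Record the written output components in ctx->outputs instead of emitting a
 * store. Only handles constant zero offsets; returns false otherwise.
 */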
bool
4621
store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4622
{
4623
unsigned write_mask = nir_intrinsic_write_mask(instr);
4624
unsigned component = nir_intrinsic_component(instr);
4625
unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4626
nir_src offset = *nir_get_io_offset_src(instr);
4627
4628
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4629
return false;
4630
4631
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4632
4633
if (instr->src[0].ssa->bit_size == 64)
4634
write_mask = widen_mask(write_mask, 2);
4635
4636
RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4637
4638
for (unsigned i = 0; i < 8; ++i) {
4639
if (write_mask & (1 << i)) {
4640
ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4641
ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4642
}
4643
idx++;
4644
}
4645
4646
return true;
4647
}
4648
4649
bool
4650
load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4651
{
4652
/* Only TCS per-vertex inputs are supported by this function.
4653
* Per-vertex inputs only match between the VS and TCS invocation IDs when the number of invocations
4654
* is the same.
4655
*/
4656
if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4657
return false;
4658
4659
nir_src* off_src = nir_get_io_offset_src(instr);
4660
nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4661
nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4662
bool can_use_temps =
4663
nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4664
nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4665
4666
if (!can_use_temps)
4667
return false;
4668
4669
unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4670
4 * nir_src_as_uint(*off_src);
4671
Temp* src = &ctx->inputs.temps[idx];
4672
create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4673
4674
return true;
4675
}
4676
4677
static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4678
4679
void
4680
visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4681
{
4682
if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4683
ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4684
(ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4685
ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4686
bool stored_to_temps = store_output_to_temps(ctx, instr);
4687
if (!stored_to_temps) {
4688
isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4689
abort();
4690
}
4691
} else {
4692
unreachable("Shader stage not implemented");
4693
}
4694
4695
/* For NGG VS and TES shaders, the primitive ID is exported after the other exports, so we
4696
* have to emit the exp instruction for it here manually. */
4697
if (ctx->stage.hw == HWStage::NGG &&
4698
(ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4699
nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4700
export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4701
}
4702
4703
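/* Interpolate one input component at the given barycentric coordinates (src)
 * using v_interp_p1/p2. 16-bit destinations use the f16 variants, and chips with
 * 16-bank LDS need a workaround sequence for them.
 */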
void
4704
emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4705
Temp prim_mask)
4706
{
4707
Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4708
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4709
4710
Builder bld(ctx->program, ctx->block);
4711
4712
if (dst.regClass() == v2b) {
4713
if (ctx->program->dev.has_16bank_lds) {
4714
assert(ctx->options->chip_class <= GFX8);
4715
Builder::Result interp_p1 =
4716
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4717
bld.m0(prim_mask), idx, component);
4718
interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4719
bld.m0(prim_mask), interp_p1, idx, component);
4720
bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4721
interp_p1, idx, component);
4722
} else {
4723
aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4724
4725
if (ctx->options->chip_class == GFX8)
4726
interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4727
4728
Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4729
bld.m0(prim_mask), idx, component);
4730
bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4731
component);
4732
}
4733
} else {
4734
Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4735
bld.m0(prim_mask), idx, component);
4736
4737
if (ctx->program->dev.has_16bank_lds)
4738
interp_p1.instr->operands[0].setLateKill(true);
4739
4740
bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4741
idx, component);
4742
}
4743
}
4744
4745
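/* Assemble gl_FragCoord from the frag_pos arguments. The .w component is replaced
 * with 1/w when POS_W_FLOAT is enabled, and .z may be adjusted for VRS on affected
 * GFX10.3 chips.
 */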
void
4746
emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4747
{
4748
Builder bld(ctx->program, ctx->block);
4749
4750
aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4751
aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4752
for (unsigned i = 0; i < num_components; i++)
4753
vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4754
if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4755
assert(num_components == 4);
4756
vec->operands[3] =
4757
bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4758
}
4759
4760
if (ctx->options->adjust_frag_coord_z &&
4761
G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4762
/* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4763
Operand frag_z = vec->operands[2];
4764
Temp adjusted_frag_z = bld.tmp(v1);
4765
Temp tmp;
4766
4767
/* dFdx fine */
4768
Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4769
tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4770
emit_wqm(bld, tmp, adjusted_frag_z, true);
4771
4772
/* adjusted_frag_z * 0.0625 + frag_z */
4773
adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4774
Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4775
4776
/* VRS Rate X = Ancillary[2:3] */
4777
Temp x_rate =
4778
bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4779
Operand::c32(2u), Operand::c32(2u));
4780
4781
/* frag_z = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4782
Temp cond =
4783
bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4784
vec->operands[2] =
4785
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4786
}
4787
4788
for (Operand& op : vec->operands)
4789
op = op.isUndefined() ? Operand::zero() : op;
4790
4791
vec->definitions[0] = Definition(dst);
4792
ctx->block->instructions.emplace_back(std::move(vec));
4793
emit_split_vector(ctx, dst, num_components);
4794
return;
4795
}
4796
4797
void
4798
emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4799
{
4800
Builder bld(ctx->program, ctx->block);
4801
Temp cond;
4802
4803
/* VRS Rate X = Ancillary[2:3]
4804
* VRS Rate Y = Ancillary[4:5]
4805
*/
4806
Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4807
Operand::c32(2u), Operand::c32(2u));
4808
Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4809
Operand::c32(4u), Operand::c32(2u));
4810
4811
/* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4812
cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4813
x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4814
bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4815
4816
/* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4817
cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4818
y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4819
bld.copy(bld.def(v1), Operand::c32(1u)), cond);
4820
4821
bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4822
}
4823
4824
void
4825
visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4826
{
4827
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4828
Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4829
unsigned idx = nir_intrinsic_base(instr);
4830
unsigned component = nir_intrinsic_component(instr);
4831
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4832
4833
assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4834
4835
if (instr->dest.ssa.num_components == 1) {
4836
emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4837
} else {
4838
aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4839
aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4840
for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4841
Temp tmp = ctx->program->allocateTmp(v1);
4842
emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4843
vec->operands[i] = Operand(tmp);
4844
}
4845
vec->definitions[0] = Definition(dst);
4846
ctx->block->instructions.emplace_back(std::move(vec));
4847
}
4848
}
4849
4850
bool
4851
check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4852
unsigned binding_align, unsigned channels)
4853
{
4854
unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4855
if (vtx_info->chan_byte_size != 4 && channels == 3)
4856
return false;
4857
4858
/* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4859
* alignment issues that trigger memory violations and eventually a GPU
4860
* hang. This can happen if the stride (static or dynamic) is unaligned and
4861
* also if the VBO offset is aligned to a scalar (e.g. stride is 8 and VBO
4862
* offset is 2 for R16G16B16A16_SNORM).
4863
*/
4864
return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4865
(offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4866
}
4867
4868
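/* Pick a data format for a vertex fetch of *channels channels. If a fetch of that
 * size can't be done safely, first try more channels (a single larger load), then
 * fewer channels (more loads), and update *channels accordingly.
 */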
uint8_t
4869
get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4870
unsigned* channels, unsigned max_channels, unsigned binding_align)
4871
{
4872
if (!vtx_info->chan_byte_size) {
4873
*channels = vtx_info->num_channels;
4874
return vtx_info->chan_format;
4875
}
4876
4877
unsigned num_channels = *channels;
4878
if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4879
unsigned new_channels = num_channels + 1;
4880
/* first, assume that more loads are worse and try using a larger data format */
4881
while (new_channels <= max_channels &&
4882
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4883
new_channels++;
4884
}
4885
4886
if (new_channels > max_channels) {
4887
/* then try decreasing load size (at the cost of more loads) */
4888
new_channels = *channels;
4889
while (new_channels > 1 &&
4890
!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4891
new_channels--;
4892
}
4893
4894
if (new_channels < *channels)
4895
*channels = new_channels;
4896
num_channels = new_channels;
4897
}
4898
4899
switch (vtx_info->chan_format) {
4900
case V_008F0C_BUF_DATA_FORMAT_8:
4901
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4902
V_008F0C_BUF_DATA_FORMAT_INVALID,
4903
V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4904
case V_008F0C_BUF_DATA_FORMAT_16:
4905
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4906
V_008F0C_BUF_DATA_FORMAT_INVALID,
4907
V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4908
case V_008F0C_BUF_DATA_FORMAT_32:
4909
return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4910
V_008F0C_BUF_DATA_FORMAT_32_32_32,
4911
V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4912
}
4913
unreachable("shouldn't reach here");
4914
return V_008F0C_BUF_DATA_FORMAT_INVALID;
4915
}
4916
4917
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
4918
* so we may need to fix it up. */
4919
Temp
4920
adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha)
4921
{
4922
Builder bld(ctx->program, ctx->block);
4923
4924
if (adjustment == AC_FETCH_FORMAT_SSCALED)
4925
alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4926
4927
/* For the integer-like cases, do a natural sign extension.
4928
*
4929
* For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4930
* and happen to contain 0, 1, 2, 3 as the two LSBs of the
4931
* exponent.
4932
*/
4933
unsigned offset = adjustment == AC_FETCH_FORMAT_SNORM ? 23u : 0u;
4934
alpha =
4935
bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
4936
4937
/* Convert back to the right type. */
4938
if (adjustment == AC_FETCH_FORMAT_SNORM) {
4939
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4940
alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4941
} else if (adjustment == AC_FETCH_FORMAT_SSCALED) {
4942
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4943
}
4944
4945
return alpha;
4946
}
4947
4948
void
4949
visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
4950
{
4951
Builder bld(ctx->program, ctx->block);
4952
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4953
nir_src offset = *nir_get_io_offset_src(instr);
4954
4955
if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4956
4957
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4958
isel_err(offset.ssa->parent_instr,
4959
"Unimplemented non-zero nir_intrinsic_load_input offset");
4960
4961
Temp vertex_buffers =
4962
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
4963
4964
unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
4965
unsigned component = nir_intrinsic_component(instr);
4966
unsigned bitsize = instr->dest.ssa.bit_size;
4967
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4968
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4969
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4970
unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4971
unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
4972
enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location];
4973
4974
unsigned dfmt = attrib_format & 0xf;
4975
unsigned nfmt = (attrib_format >> 4) & 0x7;
4976
const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
4977
4978
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4979
unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4980
bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4981
if (post_shuffle)
4982
num_channels = MAX2(num_channels, 3);
4983
4984
unsigned desc_index =
4985
ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
4986
desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
4987
u_bit_consecutive(0, desc_index));
4988
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
4989
Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4990
4991
Temp index;
4992
if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4993
uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4994
Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4995
if (divisor) {
4996
Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4997
if (divisor != 1) {
4998
Temp divided = bld.tmp(v1);
4999
emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5000
index = bld.vadd32(bld.def(v1), start_instance, divided);
5001
} else {
5002
index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5003
}
5004
} else {
5005
index = bld.copy(bld.def(v1), start_instance);
5006
}
5007
} else {
5008
index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5009
get_arg(ctx, ctx->args->ac.vertex_id));
5010
}
5011
5012
Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5013
unsigned channel_start = 0;
5014
bool direct_fetch = false;
5015
5016
/* skip unused channels at the start */
5017
if (vtx_info->chan_byte_size && !post_shuffle) {
5018
channel_start = ffs(mask) - 1;
5019
for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5020
channels[i] = Temp(0, s1);
5021
} else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5022
num_channels = 3 - (ffs(mask) - 1);
5023
}
5024
5025
/* load channels */
5026
while (channel_start < num_channels) {
5027
unsigned fetch_component = num_channels - channel_start;
5028
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5029
bool expanded = false;
5030
5031
/* use MUBUF when possible to avoid potential alignment issues */
5032
/* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5033
bool use_mubuf =
5034
(nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5035
nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5036
vtx_info->chan_byte_size == 4;
5037
unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5038
if (!use_mubuf) {
5039
fetch_dfmt =
5040
get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5041
vtx_info->num_channels - channel_start, binding_align);
5042
} else {
5043
if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5044
/* GFX6 can only load a vec3 with MTBUF (not MUBUF), so expand to vec4. */
5045
fetch_component = 4;
5046
expanded = true;
5047
}
5048
}
5049
5050
unsigned fetch_bytes = fetch_component * bitsize / 8;
5051
5052
Temp fetch_index = index;
5053
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5054
fetch_index =
5055
bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5056
fetch_offset = fetch_offset % attrib_stride;
5057
}
5058
5059
Operand soffset = Operand::zero();
5060
if (fetch_offset >= 4096) {
5061
soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5062
fetch_offset %= 4096;
5063
}
5064
5065
aco_opcode opcode;
5066
switch (fetch_bytes) {
5067
case 2:
5068
assert(!use_mubuf && bitsize == 16);
5069
opcode = aco_opcode::tbuffer_load_format_d16_x;
5070
break;
5071
case 4:
5072
if (bitsize == 16) {
5073
assert(!use_mubuf);
5074
opcode = aco_opcode::tbuffer_load_format_d16_xy;
5075
} else {
5076
opcode =
5077
use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5078
}
5079
break;
5080
case 6:
5081
assert(!use_mubuf && bitsize == 16);
5082
opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5083
break;
5084
case 8:
5085
if (bitsize == 16) {
5086
assert(!use_mubuf);
5087
opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5088
} else {
5089
opcode =
5090
use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5091
}
5092
break;
5093
case 12:
5094
assert(ctx->options->chip_class >= GFX7 ||
5095
(!use_mubuf && ctx->options->chip_class == GFX6));
5096
opcode =
5097
use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5098
break;
5099
case 16:
5100
opcode =
5101
use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5102
break;
5103
default: unreachable("Unimplemented load_input vector size");
5104
}
5105
5106
Temp fetch_dst;
5107
if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5108
(alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) {
5109
direct_fetch = true;
5110
fetch_dst = dst;
5111
} else {
5112
fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5113
}
5114
5115
if (use_mubuf) {
5116
Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5117
soffset, fetch_offset, false, false, true)
5118
.instr;
5119
mubuf->mubuf().vtx_binding = attrib_binding + 1;
5120
} else {
5121
Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5122
soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5123
.instr;
5124
mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5125
}
5126
5127
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5128
5129
if (fetch_component == 1) {
5130
channels[channel_start] = fetch_dst;
5131
} else {
5132
for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5133
channels[channel_start + i] =
5134
emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5135
}
5136
5137
channel_start += fetch_component;
5138
}
5139
5140
if (!direct_fetch) {
5141
bool is_float =
5142
nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5143
5144
static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5145
static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
5146
const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5147
unsigned num_components = instr->dest.ssa.num_components;
5148
5149
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5150
aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5151
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5152
unsigned num_temp = 0;
5153
for (unsigned i = 0; i < num_components; i++) {
5154
unsigned idx = i + component;
5155
if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5156
Temp channel = channels[swizzle[idx]];
5157
if (idx == 3 && alpha_adjust != AC_FETCH_FORMAT_NONE)
5158
channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5159
vec->operands[i] = Operand(channel);
5160
5161
num_temp++;
5162
elems[i] = channel;
5163
} else if (is_float && idx == 3) {
5164
vec->operands[i] = Operand::c32(0x3f800000u);
5165
} else if (!is_float && idx == 3) {
5166
vec->operands[i] = Operand::c32(1u);
5167
} else {
5168
vec->operands[i] = Operand::zero();
5169
}
5170
}
5171
vec->definitions[0] = Definition(dst);
5172
ctx->block->instructions.emplace_back(std::move(vec));
5173
emit_split_vector(ctx, dst, num_components);
5174
5175
if (num_temp == num_components)
5176
ctx->allocated_vec.emplace(dst.id(), elems);
5177
}
5178
} else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5179
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5180
isel_err(offset.ssa->parent_instr,
5181
"Unimplemented non-zero nir_intrinsic_load_input offset");
5182
5183
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5184
5185
unsigned idx = nir_intrinsic_base(instr);
5186
unsigned component = nir_intrinsic_component(instr);
5187
unsigned vertex_id = 2; /* P0 */
5188
5189
if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5190
nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5191
switch (src0->u32) {
5192
case 0:
5193
vertex_id = 2; /* P0 */
5194
break;
5195
case 1:
5196
vertex_id = 0; /* P10 */
5197
break;
5198
case 2:
5199
vertex_id = 1; /* P20 */
5200
break;
5201
default: unreachable("invalid vertex index");
5202
}
5203
}
5204
5205
if (dst.size() == 1) {
5206
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5207
bld.m0(prim_mask), idx, component);
5208
} else {
5209
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5210
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
5211
for (unsigned i = 0; i < dst.size(); i++)
5212
vec->operands[i] =
5213
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(vertex_id),
5214
bld.m0(prim_mask), idx, component + i);
5215
vec->definitions[0] = Definition(dst);
5216
bld.insert(std::move(vec));
5217
}
5218
} else {
5219
unreachable("Shader stage not implemented");
5220
}
5221
}
5222
5223
void
5224
visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5225
{
5226
assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5227
5228
Builder bld(ctx->program, ctx->block);
5229
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5230
5231
if (load_input_from_temps(ctx, instr, dst))
5232
return;
5233
5234
unreachable("LDS-based TCS input should have been lowered in NIR.");
5235
}
5236
5237
void
5238
visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5239
{
5240
switch (ctx->shader->info.stage) {
5241
case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5242
default: unreachable("Unimplemented shader stage");
5243
}
5244
}
5245
5246
void
5247
visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5248
{
5249
assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5250
5251
Builder bld(ctx->program, ctx->block);
5252
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5253
5254
Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5255
Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5256
Operand tes_w = Operand::zero();
5257
5258
if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5259
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5260
tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5261
tes_w = Operand(tmp);
5262
}
5263
5264
Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5265
emit_split_vector(ctx, tess_coord, 3);
5266
}
5267
5268
Temp
5269
load_desc_ptr(isel_context* ctx, unsigned desc_set)
5270
{
5271
if (ctx->program->info->need_indirect_descriptor_sets) {
5272
Builder bld(ctx->program, ctx->block);
5273
Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5274
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5275
return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5276
}
5277
5278
return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5279
}
5280
5281
void
5282
visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5283
{
5284
Builder bld(ctx->program, ctx->block);
5285
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5286
if (!nir_dest_is_divergent(instr->dest))
5287
index = bld.as_uniform(index);
5288
unsigned desc_set = nir_intrinsic_desc_set(instr);
5289
unsigned binding = nir_intrinsic_binding(instr);
5290
5291
Temp desc_ptr;
5292
radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5293
radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5294
unsigned offset = layout->binding[binding].offset;
5295
unsigned stride;
5296
if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5297
layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5298
unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5299
layout->binding[binding].dynamic_offset_offset;
5300
desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5301
offset = pipeline_layout->push_constant_size + 16 * idx;
5302
stride = 16;
5303
} else {
5304
desc_ptr = load_desc_ptr(ctx, desc_set);
5305
stride = layout->binding[binding].size;
5306
}
5307
5308
if (nir_src_is_const(instr->src[0])) {
5309
index =
5310
bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5311
} else if (index.type() == RegType::vgpr) {
5312
if (stride != 1) {
5313
bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5314
index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5315
}
5316
if (offset)
5317
index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5318
} else {
5319
if (stride != 1)
5320
index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5321
if (offset)
5322
index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5323
Operand::c32(offset), index);
5324
}
5325
5326
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5327
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5328
elems[0] = desc_ptr;
5329
elems[1] = index;
5330
ctx->allocated_vec.emplace(dst.id(), elems);
5331
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5332
}
5333
5334
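/* Load from a buffer resource, using SMEM when the result is uniform and the
 * access allows it (no GLC before GFX8), and MUBUF otherwise.
 */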
void
5335
load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5336
Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5337
bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5338
{
5339
Builder bld(ctx->program, ctx->block);
5340
5341
bool use_smem =
5342
dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5343
if (use_smem)
5344
offset = bld.as_uniform(offset);
5345
else {
5346
/* GFX6-7 are affected by a hw bug that prevents address clamping from
5347
* working correctly when the SGPR offset is used.
5348
*/
5349
if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5350
offset = as_vgpr(ctx, offset);
5351
}
5352
5353
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5354
info.glc = glc;
5355
info.sync = sync;
5356
info.align_mul = align_mul;
5357
info.align_offset = align_offset;
5358
if (use_smem)
5359
emit_load(ctx, bld, info, smem_load_params);
5360
else
5361
emit_load(ctx, bld, info, mubuf_load_params);
5362
}
5363
5364
Temp
5365
load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5366
{
5367
Builder bld(ctx->program, ctx->block);
5368
Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5369
Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5370
set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5371
return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5372
}
5373
5374
bool
5375
is_inline_ubo(isel_context* ctx, nir_src rsrc)
5376
{
5377
nir_binding binding = nir_chase_binding(rsrc);
5378
if (!binding.success)
5379
return false;
5380
5381
radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5382
return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5383
}
5384
5385
void
5386
visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5387
{
5388
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5389
Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5390
5391
Builder bld(ctx->program, ctx->block);
5392
5393
if (is_inline_ubo(ctx, instr->src[0])) {
5394
Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5395
Temp binding_off =
5396
bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5397
rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
5398
5399
uint32_t desc_type =
5400
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5401
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5402
if (ctx->options->chip_class >= GFX10) {
5403
desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5404
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5405
} else {
5406
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5407
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5408
}
5409
rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5410
Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5411
Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5412
} else {
5413
rsrc = load_buffer_rsrc(ctx, rsrc);
5414
}
5415
unsigned size = instr->dest.ssa.bit_size / 8;
5416
load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5417
nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5418
}
5419
5420
void
5421
visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5422
{
5423
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5424
Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5425
unsigned binding = nir_intrinsic_binding(instr);
5426
unsigned base = nir_intrinsic_base(instr);
5427
5428
index = as_vgpr(ctx, index);
5429
5430
Builder bld(ctx->program, ctx->block);
5431
Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5432
Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5433
Temp rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), desc_base, desc_off);
5434
5435
/* If we ever want more than a single 32-bit component, this needs to be implemented. */
5436
assert(instr->dest.ssa.bit_size == 32);
5437
assert(instr->num_components == 1);
5438
5439
bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst), rsrc, index, Operand::zero(), base,
5440
false, false, true);
5441
}
5442
5443
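/* Load push constants: values covered by the inline push constant arguments are
 * read directly from SGPRs, everything else is loaded from the push constant
 * buffer with SMEM (unaligned 8/16-bit results are fixed up afterwards).
 */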
void
5444
visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5445
{
5446
Builder bld(ctx->program, ctx->block);
5447
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5448
unsigned offset = nir_intrinsic_base(instr);
5449
unsigned count = instr->dest.ssa.num_components;
5450
nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5451
5452
if (index_cv && instr->dest.ssa.bit_size == 32) {
5453
unsigned start = (offset + index_cv->u32) / 4u;
5454
start -= ctx->args->ac.base_inline_push_consts;
5455
if (start + count <= ctx->args->ac.num_inline_push_consts) {
5456
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5457
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5458
aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5459
for (unsigned i = 0; i < count; ++i) {
5460
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5461
vec->operands[i] = Operand{elems[i]};
5462
}
5463
vec->definitions[0] = Definition(dst);
5464
ctx->block->instructions.emplace_back(std::move(vec));
5465
ctx->allocated_vec.emplace(dst.id(), elems);
5466
return;
5467
}
5468
}
5469
5470
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5471
if (offset != 0) // TODO check if index != 0 as well
5472
index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5473
Operand::c32(offset), index);
5474
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5475
Temp vec = dst;
5476
bool trim = false;
5477
bool aligned = true;
5478
5479
if (instr->dest.ssa.bit_size == 8) {
5480
aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5481
bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5482
if (!aligned)
5483
vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5484
} else if (instr->dest.ssa.bit_size == 16) {
5485
aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5486
if (!aligned)
5487
vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5488
}
5489
5490
aco_opcode op;
5491
5492
switch (vec.size()) {
5493
case 1: op = aco_opcode::s_load_dword; break;
5494
case 2: op = aco_opcode::s_load_dwordx2; break;
5495
case 3:
5496
vec = bld.tmp(s4);
5497
trim = true;
5498
FALLTHROUGH;
5499
case 4: op = aco_opcode::s_load_dwordx4; break;
5500
case 6:
5501
vec = bld.tmp(s8);
5502
trim = true;
5503
FALLTHROUGH;
5504
case 8: op = aco_opcode::s_load_dwordx8; break;
5505
default: unreachable("unimplemented or forbidden load_push_constant.");
5506
}
5507
5508
bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5509
5510
if (!aligned) {
5511
Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5512
byte_align_scalar(ctx, vec, byte_offset, dst);
5513
return;
5514
}
5515
5516
if (trim) {
5517
emit_split_vector(ctx, vec, 4);
5518
RegClass rc = dst.size() == 3 ? s1 : s2;
5519
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5520
emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5521
}
5522
emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5523
}
5524
5525
void
5526
visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5527
{
5528
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5529
5530
Builder bld(ctx->program, ctx->block);
5531
5532
uint32_t desc_type =
5533
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5534
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5535
if (ctx->options->chip_class >= GFX10) {
5536
desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5537
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5538
} else {
5539
desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5540
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5541
}
5542
5543
unsigned base = nir_intrinsic_base(instr);
5544
unsigned range = nir_intrinsic_range(instr);
5545
5546
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5547
if (base && offset.type() == RegType::sgpr)
5548
offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5549
Operand::c32(base));
5550
else if (base && offset.type() == RegType::vgpr)
5551
offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5552
5553
Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5554
bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5555
Operand::c32(ctx->constant_data_offset)),
5556
Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5557
Operand::c32(desc_type));
5558
unsigned size = instr->dest.ssa.bit_size / 8;
5559
// TODO: get alignment information for subdword constants
5560
load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5561
}
5562
5563
void
visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
{
   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = true;

   ctx->program->needs_exact = true;

   // TODO: optimize uniform conditions
   Builder bld(ctx->program, ctx->block);
   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
   assert(src.regClass() == bld.lm);
   src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
   bld.pseudo(aco_opcode::p_discard_if, src);
   ctx->block->kind |= block_kind_uses_discard_if;
   return;
}

void
visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = true;

   bool divergent =
      ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;

   if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
      /* we handle discards the same way as jump instructions */
      append_logical_end(ctx->block);

      /* in loops, discard behaves like break */
      Block* linear_target = ctx->cf_info.parent_loop.exit;
      ctx->block->kind |= block_kind_discard;

      /* uniform discard - loop ends here */
      assert(nir_instr_is_last(&instr->instr));
      ctx->block->kind |= block_kind_uniform;
      ctx->cf_info.has_branch = true;
      bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
      add_linear_edge(ctx->block->index, linear_target);
      return;
   }

   /* it can currently happen that NIR doesn't remove the unreachable code */
   if (!nir_instr_is_last(&instr->instr)) {
      ctx->program->needs_exact = true;
      /* save exec somewhere temporarily so that it doesn't get
       * overwritten before the discard from outer exec masks */
      Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
      bld.pseudo(aco_opcode::p_discard_if, cond);
      ctx->block->kind |= block_kind_uses_discard_if;
      return;
   }

   /* This condition is incorrect for uniformly branched discards in a loop
    * predicated by a divergent condition, but the above code catches that case
    * and the discard would end up turning into a discard_if.
    * For example:
    * if (divergent) {
    *    while (...) {
    *       if (uniform) {
    *          discard;
    *       }
    *    }
    * }
    */
   if (!ctx->cf_info.parent_if.is_divergent) {
      /* program just ends here */
      ctx->block->kind |= block_kind_uses_discard_if;
      bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
      // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
   } else {
      ctx->block->kind |= block_kind_discard;
      /* branch and linear edge is added by visit_if() */
   }
}

enum aco_descriptor_type {
   ACO_DESC_IMAGE,
   ACO_DESC_FMASK,
   ACO_DESC_SAMPLER,
   ACO_DESC_BUFFER,
   ACO_DESC_PLANE_0,
   ACO_DESC_PLANE_1,
   ACO_DESC_PLANE_2,
};

static bool
should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
{
   if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
      return false;
   ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
   return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
          dim == ac_image_2darraymsaa;
}

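/* Builds the image/sampler/buffer descriptor for a texture or image access:
 * the deref chain is walked to accumulate a constant element index plus an
 * optional dynamic (uniformized) index, the matching binding of the
 * descriptor set layout supplies offset and stride, and the 4 or 8 dwords are
 * then loaded with SMEM. Immutable samplers that are known at compile time
 * are materialized directly as constants instead of being loaded. */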
Temp
5665
get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5666
enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5667
{
5668
   /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
      std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
      32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
   */
5672
Temp index = Temp();
5673
bool index_set = false;
5674
unsigned constant_index = 0;
5675
unsigned descriptor_set;
5676
unsigned base_index;
5677
Builder bld(ctx->program, ctx->block);
5678
5679
if (!deref_instr) {
5680
assert(tex_instr);
5681
descriptor_set = 0;
5682
base_index = tex_instr->sampler_index;
5683
} else {
5684
while (deref_instr->deref_type != nir_deref_type_var) {
5685
unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5686
if (!array_size)
5687
array_size = 1;
5688
5689
assert(deref_instr->deref_type == nir_deref_type_array);
5690
nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5691
if (const_value) {
5692
constant_index += array_size * const_value->u32;
5693
} else {
5694
Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5695
if (indirect.type() == RegType::vgpr)
5696
indirect = bld.as_uniform(indirect);
5697
5698
if (array_size != 1)
5699
indirect =
5700
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5701
5702
if (!index_set) {
5703
index = indirect;
5704
index_set = true;
5705
} else {
5706
index =
5707
bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5708
}
5709
}
5710
5711
deref_instr = nir_src_as_deref(deref_instr->parent);
5712
}
5713
descriptor_set = deref_instr->var->data.descriptor_set;
5714
base_index = deref_instr->var->data.binding;
5715
}
5716
5717
Temp list = load_desc_ptr(ctx, descriptor_set);
5718
list = convert_pointer_to_64_bit(ctx, list);
5719
5720
struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5721
struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5722
unsigned offset = binding->offset;
5723
unsigned stride = binding->size;
5724
aco_opcode opcode;
5725
RegClass type;
5726
5727
assert(base_index < layout->binding_count);
5728
5729
switch (desc_type) {
5730
case ACO_DESC_IMAGE:
5731
type = s8;
5732
opcode = aco_opcode::s_load_dwordx8;
5733
break;
5734
case ACO_DESC_FMASK:
5735
type = s8;
5736
opcode = aco_opcode::s_load_dwordx8;
5737
offset += 32;
5738
break;
5739
case ACO_DESC_SAMPLER:
5740
type = s4;
5741
opcode = aco_opcode::s_load_dwordx4;
5742
if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5743
offset += radv_combined_image_descriptor_sampler_offset(binding);
5744
break;
5745
case ACO_DESC_BUFFER:
5746
type = s4;
5747
opcode = aco_opcode::s_load_dwordx4;
5748
break;
5749
case ACO_DESC_PLANE_0:
5750
case ACO_DESC_PLANE_1:
5751
type = s8;
5752
opcode = aco_opcode::s_load_dwordx8;
5753
offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5754
break;
5755
case ACO_DESC_PLANE_2:
5756
type = s4;
5757
opcode = aco_opcode::s_load_dwordx4;
5758
offset += 64;
5759
break;
5760
default: unreachable("invalid desc_type\n");
5761
}
5762
5763
offset += constant_index * stride;
5764
5765
if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5766
(!index_set || binding->immutable_samplers_equal)) {
5767
if (binding->immutable_samplers_equal)
5768
constant_index = 0;
5769
5770
const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5771
uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5772
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5773
Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5774
Operand::c32(samplers[constant_index * 4 + 1]),
5775
Operand::c32(samplers[constant_index * 4 + 2]),
5776
Operand::c32(samplers[constant_index * 4 + 3]));
5777
}
5778
5779
Operand off;
5780
if (!index_set) {
5781
off = bld.copy(bld.def(s1), Operand::c32(offset));
5782
} else {
5783
off = Operand(
5784
(Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5785
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5786
}
5787
5788
Temp res = bld.smem(opcode, bld.def(type), list, off);
5789
5790
if (desc_type == ACO_DESC_PLANE_2) {
5791
Temp components[8];
5792
for (unsigned i = 0; i < 8; i++)
5793
components[i] = bld.tmp(s1);
5794
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5795
Definition(components[2]), Definition(components[3]), res);
5796
5797
Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5798
bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5799
Definition(components[4]), Definition(components[5]), Definition(components[6]),
5800
Definition(components[7]), desc2);
5801
5802
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5803
components[2], components[3], components[4], components[5], components[6],
5804
components[7]);
5805
} else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5806
!write) {
5807
Temp components[8];
5808
for (unsigned i = 0; i < 8; i++)
5809
components[i] = bld.tmp(s1);
5810
5811
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5812
Definition(components[2]), Definition(components[3]), Definition(components[4]),
5813
Definition(components[5]), Definition(components[6]), Definition(components[7]),
5814
res);
5815
5816
      /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a
       * hardware bug.
       */
5819
components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5820
bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5821
5822
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5823
components[2], components[3], components[4], components[5], components[6],
5824
components[7]);
5825
} else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5826
Temp components[4];
5827
for (unsigned i = 0; i < 4; i++)
5828
components[i] = bld.tmp(s1);
5829
5830
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5831
Definition(components[2]), Definition(components[3]), res);
5832
5833
/* We want to always use the linear filtering truncation behaviour for
5834
* nir_texop_tg4, even if the sampler uses nearest/point filtering.
5835
*/
5836
components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5837
Operand::c32(C_008F30_TRUNC_COORD));
5838
5839
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5840
components[2], components[3]);
5841
}
5842
5843
return res;
5844
}
5845
5846
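/* Number of coordinate components NIR supplies for each image dimensionality;
 * arrayed and multisampled images carry an extra layer/sample component
 * (the sample component for MS images is filled in by get_image_coords). */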
static int
image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_BUF: return 1;
   case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
   case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
   case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
   case GLSL_SAMPLER_DIM_3D:
   case GLSL_SAMPLER_DIM_CUBE: return 3;
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_SUBPASS: return 2;
   case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
   default: break;
   }
   return 0;
}

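/* Common helper for emitting MIMG instructions. On GFX10+ the non-sequential
 * address (NSA) encoding lets each coordinate live in its own VGPR (up to a
 * chip-dependent limit); otherwise the coordinates are first packed into a
 * single vector with p_create_vector. wqm_mask marks which coordinate
 * operands must be computed in whole-quad mode. */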
static MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
          std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
{
   /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
   unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
   bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;

   if (!use_nsa) {
      Temp coord = coords[0];
      if (coords.size() > 1) {
         coord = bld.tmp(RegType::vgpr, coords.size());

         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
         for (unsigned i = 0; i < coords.size(); i++)
            vec->operands[i] = Operand(coords[i]);
         vec->definitions[0] = Definition(coord);
         bld.insert(std::move(vec));
      } else if (coord.type() == RegType::sgpr) {
         coord = bld.copy(bld.def(v1), coord);
      }

      if (wqm_mask) {
         /* We don't need the bias, sample index, compare value or offset to be
          * computed in WQM but if the p_create_vector copies the coordinates, then it
          * needs to be in WQM. */
         coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
      }

      coords[0] = coord;
      coords.resize(1);
   } else {
      for (unsigned i = 0; i < coords.size(); i++) {
         if (wqm_mask & (1u << i))
            coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
      }

      for (Temp& coord : coords) {
         if (coord.type() == RegType::sgpr)
            coord = bld.copy(bld.def(v1), coord);
      }
   }

   aco_ptr<MIMG_instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
   if (dst.isTemp())
      mimg->definitions[0] = dst;
   mimg->operands[0] = Operand(rsrc);
   mimg->operands[1] = samp;
   mimg->operands[2] = vdata;
   for (unsigned i = 0; i < coords.size(); i++)
      mimg->operands[3 + i] = Operand(coords[i]);

   MIMG_instruction* res = mimg.get();
   bld.insert(std::move(mimg));
   return res;
}
5922
5923
void
5924
visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5925
{
5926
Builder bld(ctx->program, ctx->block);
5927
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5928
Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5929
Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5930
Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5931
Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5932
Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5933
Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5934
5935
std::vector<Temp> args;
5936
args.push_back(emit_extract_vector(ctx, node, 0, v1));
5937
args.push_back(emit_extract_vector(ctx, node, 1, v1));
5938
args.push_back(as_vgpr(ctx, tmax));
5939
args.push_back(emit_extract_vector(ctx, origin, 0, v1));
5940
args.push_back(emit_extract_vector(ctx, origin, 1, v1));
5941
args.push_back(emit_extract_vector(ctx, origin, 2, v1));
5942
args.push_back(emit_extract_vector(ctx, dir, 0, v1));
5943
args.push_back(emit_extract_vector(ctx, dir, 1, v1));
5944
args.push_back(emit_extract_vector(ctx, dir, 2, v1));
5945
args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
5946
args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
5947
args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
5948
5949
MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
5950
resource, Operand(s4), args);
5951
mimg->dim = ac_image_1d;
5952
mimg->dmask = 0xf;
5953
mimg->unrm = true;
5954
mimg->r128 = true;
5955
}
5956
5957
/* Adjust the sample index according to FMASK.
 *
 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
 * which is the identity mapping. Each nibble says which physical sample
 * should be fetched to get that sample.
 *
 * For example, 0x11111100 means there are only 2 samples stored and
 * the second sample covers 3/4 of the pixel. When reading samples 0
 * and 1, return physical sample 0 (determined by the first two 0s
 * in FMASK), otherwise return physical sample 1.
 *
 * The sample index should be adjusted as follows:
 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
 */
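/* For example, applying that formula to the FMASK value 0x11111100 above:
 * requesting sample 3 reads nibble 3, (0x11111100 >> 12) & 0xF = 1, so
 * physical sample 1 is fetched, while samples 0 and 1 both map to physical
 * sample 0. */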
static Temp
adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector<Temp>& coords,
                                Operand sample_index, Temp fmask_desc_ptr)
{
   Builder bld(ctx->program, ctx->block);
   Temp fmask = bld.tmp(v1);
   unsigned dim = ctx->options->chip_class >= GFX10
                     ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
                     : 0;

   MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask),
                                      fmask_desc_ptr, Operand(s4), coords);
   load->glc = false;
   load->dlc = false;
   load->dmask = 0x1;
   load->unrm = true;
   load->da = da;
   load->dim = dim;

   Operand sample_index4;
   if (sample_index.isConstant()) {
      if (sample_index.constantValue() < 16) {
         sample_index4 = Operand::c32(sample_index.constantValue() << 2);
      } else {
         sample_index4 = Operand::zero();
      }
   } else if (sample_index.regClass() == s1) {
      sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index,
                               Operand::c32(2u));
   } else {
      assert(sample_index.regClass() == v1);
      sample_index4 =
         bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), sample_index);
   }

   Temp final_sample;
   if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
      final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(15u), fmask);
   else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
      final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(28u), fmask);
   else
      final_sample =
         bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand::c32(4u));

   /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
    * resource descriptor is 0 (invalid).
    */
   Temp compare = bld.tmp(bld.lm);
   bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand::zero(),
                emit_extract_vector(ctx, fmask_desc_ptr, 1, s1))
      .def(0)
      .setHint(vcc);

   Temp sample_index_v = bld.copy(bld.def(v1), sample_index);

   /* Replace the MSAA sample index. */
   return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
}
6029
6030
static std::vector<Temp>
6031
get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr, const struct glsl_type* type)
6032
{
6033
6034
Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6035
enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6036
bool is_array = glsl_sampler_type_is_array(type);
6037
ASSERTED bool add_frag_pos =
6038
(dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6039
assert(!add_frag_pos && "Input attachments should be lowered.");
6040
bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6041
bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6042
int count = image_type_to_components_count(dim, is_array);
6043
std::vector<Temp> coords(count);
6044
Builder bld(ctx->program, ctx->block);
6045
6046
if (is_ms) {
6047
count--;
6048
Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
6049
/* get sample index */
6050
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6051
instr->intrinsic == nir_intrinsic_image_deref_sparse_load) {
6052
nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]);
6053
Operand sample_index = sample_cv ? Operand::c32(sample_cv->u32)
6054
: Operand(emit_extract_vector(ctx, src2, 0, v1));
6055
std::vector<Temp> fmask_load_address;
6056
for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
6057
fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
6058
6059
Temp fmask_desc_ptr =
6060
get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6061
ACO_DESC_FMASK, nullptr, false);
6062
coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address,
6063
sample_index, fmask_desc_ptr);
6064
} else {
6065
coords[count] = emit_extract_vector(ctx, src2, 0, v1);
6066
}
6067
}
6068
6069
if (gfx9_1d) {
6070
coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6071
coords.resize(coords.size() + 1);
6072
coords[1] = bld.copy(bld.def(v1), Operand::zero());
6073
if (is_array)
6074
coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6075
} else {
6076
for (int i = 0; i < count; i++)
6077
coords[i] = emit_extract_vector(ctx, src0, i, v1);
6078
}
6079
6080
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6081
instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6082
instr->intrinsic == nir_intrinsic_image_deref_store) {
6083
int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6084
bool level_zero =
6085
nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6086
6087
if (!level_zero)
6088
coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6089
}
6090
6091
return coords;
6092
}
6093
6094
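/* Derives the memory_sync_info for a memory intrinsic from its NIR access
 * flags: volatile accesses gain semantic_volatile and reorderable accesses
 * gain semantic_can_reorder | semantic_private, while atomics keep the
 * semantics passed in unchanged. */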
memory_sync_info
get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
{
   /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
   if (semantics & semantic_atomicrmw)
      return memory_sync_info(storage, semantics);

   unsigned access = nir_intrinsic_access(instr);

   if (access & ACCESS_VOLATILE)
      semantics |= semantic_volatile;
   if (access & ACCESS_CAN_REORDER)
      semantics |= semantic_can_reorder | semantic_private;

   return memory_sync_info(storage, semantics);
}

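/* Builds the zero-initialized data operand used when TFE (texel fail enable)
 * is set: a sparse load only writes the fetched channels plus the residency
 * code, so the destination has to start out zeroed to keep the unwritten
 * channels well defined. */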
Operand
emit_tfe_init(Builder& bld, Temp dst)
{
   Temp tmp = bld.tmp(dst.regClass());

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
   for (unsigned i = 0; i < dst.size(); i++)
      vec->operands[i] = Operand::zero();
   vec->definitions[0] = Definition(tmp);
   /* Since this is fixed to an instruction's definition register, any CSE will
    * just create copies. Copying costs about the same as zero-initialization,
    * but these copies can break up clauses.
    */
   vec->definitions[0].setNoCSE(true);
   bld.insert(std::move(vec));

   return Operand(tmp);
}
6130
6131
void
6132
visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6133
{
6134
Builder bld(ctx->program, ctx->block);
6135
const nir_variable* var =
6136
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6137
const struct glsl_type* type = glsl_without_array(var->type);
6138
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6139
bool is_array = glsl_sampler_type_is_array(type);
6140
bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6141
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6142
6143
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6144
unsigned access = var->data.access | nir_intrinsic_access(instr);
6145
6146
unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6147
unsigned expand_mask =
6148
nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6149
expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6150
if (dim == GLSL_SAMPLER_DIM_BUF)
6151
expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6152
unsigned dmask = expand_mask;
6153
if (instr->dest.ssa.bit_size == 64) {
6154
expand_mask &= 0x9;
6155
/* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6156
dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6157
}
6158
if (is_sparse)
6159
expand_mask |= 1 << result_size;
6160
unsigned num_components = util_bitcount(dmask) + is_sparse;
6161
6162
Temp tmp;
6163
if (num_components == dst.size() && dst.type() == RegType::vgpr)
6164
tmp = dst;
6165
else
6166
tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6167
6168
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6169
dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6170
nullptr, false);
6171
6172
if (dim == GLSL_SAMPLER_DIM_BUF) {
6173
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6174
6175
aco_opcode opcode;
6176
switch (util_bitcount(dmask)) {
6177
case 1: opcode = aco_opcode::buffer_load_format_x; break;
6178
case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6179
case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6180
case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6181
default: unreachable(">4 channel buffer image load");
6182
}
6183
aco_ptr<MUBUF_instruction> load{
6184
create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6185
load->operands[0] = Operand(resource);
6186
load->operands[1] = Operand(vindex);
6187
load->operands[2] = Operand::c32(0);
6188
load->definitions[0] = Definition(tmp);
6189
load->idxen = true;
6190
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6191
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6192
load->sync = sync;
6193
load->tfe = is_sparse;
6194
if (load->tfe)
6195
load->operands[3] = emit_tfe_init(bld, tmp);
6196
ctx->block->instructions.emplace_back(std::move(load));
6197
} else {
6198
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6199
6200
bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6201
aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6202
6203
Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6204
MIMG_instruction* load =
6205
emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6206
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6207
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6208
load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6209
load->dmask = dmask;
6210
load->unrm = true;
6211
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6212
load->sync = sync;
6213
load->tfe = is_sparse;
6214
}
6215
6216
if (is_sparse && instr->dest.ssa.bit_size == 64) {
6217
/* The result components are 64-bit but the sparse residency code is
6218
* 32-bit. So add a zero to the end so expand_vector() works correctly.
6219
*/
6220
tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6221
Operand::zero());
6222
}
6223
6224
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6225
}
6226
6227
void
6228
visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6229
{
6230
const nir_variable* var =
6231
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6232
const struct glsl_type* type = glsl_without_array(var->type);
6233
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6234
bool is_array = glsl_sampler_type_is_array(type);
6235
Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6236
6237
/* only R64_UINT and R64_SINT supported */
6238
if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6239
data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6240
data = as_vgpr(ctx, data);
6241
6242
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6243
unsigned access = var->data.access | nir_intrinsic_access(instr);
6244
bool glc = ctx->options->chip_class == GFX6 ||
6245
access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
6246
? 1
6247
: 0;
6248
6249
if (dim == GLSL_SAMPLER_DIM_BUF) {
6250
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6251
ACO_DESC_BUFFER, nullptr, true);
6252
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6253
aco_opcode opcode;
6254
switch (data.size()) {
6255
case 1: opcode = aco_opcode::buffer_store_format_x; break;
6256
case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6257
case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6258
case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6259
default: unreachable(">4 channel buffer image store");
6260
}
6261
aco_ptr<MUBUF_instruction> store{
6262
create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6263
store->operands[0] = Operand(rsrc);
6264
store->operands[1] = Operand(vindex);
6265
store->operands[2] = Operand::c32(0);
6266
store->operands[3] = Operand(data);
6267
store->idxen = true;
6268
store->glc = glc;
6269
store->dlc = false;
6270
store->disable_wqm = true;
6271
store->sync = sync;
6272
ctx->program->needs_exact = true;
6273
ctx->block->instructions.emplace_back(std::move(store));
6274
return;
6275
}
6276
6277
assert(data.type() == RegType::vgpr);
6278
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6279
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6280
ACO_DESC_IMAGE, nullptr, true);
6281
6282
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6283
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6284
6285
Builder bld(ctx->program, ctx->block);
6286
MIMG_instruction* store =
6287
emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6288
store->glc = glc;
6289
store->dlc = false;
6290
store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6291
store->dmask = (1 << data.size()) - 1;
6292
store->unrm = true;
6293
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6294
store->disable_wqm = true;
6295
store->sync = sync;
6296
ctx->program->needs_exact = true;
6297
return;
6298
}
6299
6300
void
6301
visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6302
{
6303
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6304
const nir_variable* var =
6305
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6306
const struct glsl_type* type = glsl_without_array(var->type);
6307
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6308
bool is_array = glsl_sampler_type_is_array(type);
6309
Builder bld(ctx->program, ctx->block);
6310
6311
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6312
bool is_64bit = data.bytes() == 8;
6313
assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6314
6315
if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6316
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6317
get_ssa_temp(ctx, instr->src[4].ssa), data);
6318
6319
aco_opcode buf_op, buf_op64, image_op;
6320
switch (instr->intrinsic) {
6321
case nir_intrinsic_image_deref_atomic_add:
6322
buf_op = aco_opcode::buffer_atomic_add;
6323
buf_op64 = aco_opcode::buffer_atomic_add_x2;
6324
image_op = aco_opcode::image_atomic_add;
6325
break;
6326
case nir_intrinsic_image_deref_atomic_umin:
6327
buf_op = aco_opcode::buffer_atomic_umin;
6328
buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6329
image_op = aco_opcode::image_atomic_umin;
6330
break;
6331
case nir_intrinsic_image_deref_atomic_imin:
6332
buf_op = aco_opcode::buffer_atomic_smin;
6333
buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6334
image_op = aco_opcode::image_atomic_smin;
6335
break;
6336
case nir_intrinsic_image_deref_atomic_umax:
6337
buf_op = aco_opcode::buffer_atomic_umax;
6338
buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6339
image_op = aco_opcode::image_atomic_umax;
6340
break;
6341
case nir_intrinsic_image_deref_atomic_imax:
6342
buf_op = aco_opcode::buffer_atomic_smax;
6343
buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6344
image_op = aco_opcode::image_atomic_smax;
6345
break;
6346
case nir_intrinsic_image_deref_atomic_and:
6347
buf_op = aco_opcode::buffer_atomic_and;
6348
buf_op64 = aco_opcode::buffer_atomic_and_x2;
6349
image_op = aco_opcode::image_atomic_and;
6350
break;
6351
case nir_intrinsic_image_deref_atomic_or:
6352
buf_op = aco_opcode::buffer_atomic_or;
6353
buf_op64 = aco_opcode::buffer_atomic_or_x2;
6354
image_op = aco_opcode::image_atomic_or;
6355
break;
6356
case nir_intrinsic_image_deref_atomic_xor:
6357
buf_op = aco_opcode::buffer_atomic_xor;
6358
buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6359
image_op = aco_opcode::image_atomic_xor;
6360
break;
6361
case nir_intrinsic_image_deref_atomic_exchange:
6362
buf_op = aco_opcode::buffer_atomic_swap;
6363
buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6364
image_op = aco_opcode::image_atomic_swap;
6365
break;
6366
case nir_intrinsic_image_deref_atomic_comp_swap:
6367
buf_op = aco_opcode::buffer_atomic_cmpswap;
6368
buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6369
image_op = aco_opcode::image_atomic_cmpswap;
6370
break;
6371
default:
6372
unreachable("visit_image_atomic should only be called with "
6373
"nir_intrinsic_image_deref_atomic_* instructions.");
6374
}
6375
6376
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6377
memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6378
6379
if (dim == GLSL_SAMPLER_DIM_BUF) {
6380
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6381
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6382
ACO_DESC_BUFFER, nullptr, true);
6383
// assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
6384
// implemented.");
6385
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6386
is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6387
mubuf->operands[0] = Operand(resource);
6388
mubuf->operands[1] = Operand(vindex);
6389
mubuf->operands[2] = Operand::c32(0);
6390
mubuf->operands[3] = Operand(data);
6391
if (return_previous)
6392
mubuf->definitions[0] = Definition(dst);
6393
mubuf->offset = 0;
6394
mubuf->idxen = true;
6395
mubuf->glc = return_previous;
6396
mubuf->dlc = false; /* Not needed for atomics */
6397
mubuf->disable_wqm = true;
6398
mubuf->sync = sync;
6399
ctx->program->needs_exact = true;
6400
ctx->block->instructions.emplace_back(std::move(mubuf));
6401
return;
6402
}
6403
6404
std::vector<Temp> coords = get_image_coords(ctx, instr, type);
6405
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6406
ACO_DESC_IMAGE, nullptr, true);
6407
Definition def = return_previous ? Definition(dst) : Definition();
6408
MIMG_instruction* mimg =
6409
emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6410
mimg->glc = return_previous;
6411
mimg->dlc = false; /* Not needed for atomics */
6412
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6413
mimg->dmask = (1 << data.size()) - 1;
6414
mimg->unrm = true;
6415
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6416
mimg->disable_wqm = true;
6417
mimg->sync = sync;
6418
ctx->program->needs_exact = true;
6419
return;
6420
}
6421
6422
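/* Roughly speaking: returns the buffer size in elements. On GFX8 the size
 * dword (element 2 of the descriptor) has to be divided by the stride, which
 * is one of 1, 2, 4, 8, 12 or 16. Power-of-two strides become a shift by the
 * stride's lowest set bit; the stride-12 case is first multiplied by the 1/3
 * reciprocal 0xaaaaaaab (v_mul_hi_u32 plus a shift by one) and then the same
 * shift by two divides out the remaining factor of four. On other chips the
 * size dword can be returned directly. */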
void
get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
{
   if (ctx->options->chip_class == GFX8) {
      /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
      Builder bld(ctx->program, ctx->block);

      Temp size = emit_extract_vector(ctx, desc, 2, s1);

      Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
                                bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
      size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
                           bld.as_uniform(size_div3), Operand::c32(1u));

      Temp stride = emit_extract_vector(ctx, desc, 1, s1);
      stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
                        Operand::c32((5u << 16) | 16u));

      Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
      size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));

      Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
      bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
               bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
      if (dst.type() == RegType::vgpr)
         bld.copy(Definition(dst), shr_dst);

      /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
   } else {
      emit_extract_vector(ctx, desc, 2, dst);
   }
}
6454
6455
void
6456
visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6457
{
6458
const nir_variable* var =
6459
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6460
const struct glsl_type* type = glsl_without_array(var->type);
6461
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6462
bool is_array = glsl_sampler_type_is_array(type);
6463
Builder bld(ctx->program, ctx->block);
6464
6465
if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6466
Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6467
ACO_DESC_BUFFER, NULL, false);
6468
return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6469
}
6470
6471
/* LOD */
6472
assert(nir_src_as_uint(instr->src[1]) == 0);
6473
std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6474
6475
/* Resource */
6476
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6477
ACO_DESC_IMAGE, NULL, false);
6478
6479
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6480
6481
MIMG_instruction* mimg =
6482
emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6483
uint8_t& dmask = mimg->dmask;
6484
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6485
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6486
mimg->da = glsl_sampler_type_is_array(type);
6487
6488
if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && glsl_sampler_type_is_array(type)) {
6489
6490
assert(instr->dest.ssa.num_components == 3);
6491
Temp tmp = ctx->program->allocateTmp(v3);
6492
mimg->definitions[0] = Definition(tmp);
6493
emit_split_vector(ctx, tmp, 3);
6494
6495
/* divide 3rd value by 6 by multiplying with magic number */
6496
Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));
6497
Temp by_6 =
6498
bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6499
6500
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, tmp, 0, v1),
6501
emit_extract_vector(ctx, tmp, 1, v1), by_6);
6502
6503
} else if (ctx->options->chip_class == GFX9 &&
6504
glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6505
glsl_sampler_type_is_array(type)) {
6506
assert(instr->dest.ssa.num_components == 2);
6507
dmask = 0x5;
6508
}
6509
6510
emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6511
}
6512
6513
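/* Reads the sample count out of an image descriptor: bits [19:16] of dword 3
 * hold log2(samples) for MSAA resources and bits [31:28] hold the resource
 * type, where (as assumed here) values of 14 and up are the 2D MSAA types.
 * Non-MSAA images report one sample, or the null-descriptor check result when
 * robust buffer access is enabled. */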
void
get_image_samples(isel_context* ctx, Definition dst, Temp resource)
{
   Builder bld(ctx->program, ctx->block);

   Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
   Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
                                Operand::c32(16u | 4u << 16));
   Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
                           samples_log2);
   Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
                        Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));

   Operand default_sample = Operand::c32(1u);
   if (ctx->options->robust_buffer_access) {
      /* Extract the second dword of the descriptor; if it's
       * all zero, then it's a null descriptor.
       */
      Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
      Temp is_non_null_descriptor =
         bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
      default_sample = Operand(is_non_null_descriptor);
   }

   Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
   bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
}

void
visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
                                    ACO_DESC_IMAGE, NULL, false);
   get_image_samples(ctx, Definition(dst), resource);
}

void
visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   unsigned num_components = instr->num_components;

   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));

   unsigned access = nir_intrinsic_access(instr);
   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
   unsigned size = instr->dest.ssa.bit_size / 8;

   bool allow_smem = access & ACCESS_CAN_REORDER;

   load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
               get_memory_sync_info(instr, storage_buffer, 0));
}
6570
6571
void
6572
visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6573
{
6574
Builder bld(ctx->program, ctx->block);
6575
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6576
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6577
unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6578
Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6579
6580
Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6581
6582
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6583
bool glc =
6584
nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6585
6586
unsigned write_count = 0;
6587
Temp write_datas[32];
6588
unsigned offsets[32];
6589
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6590
write_datas, offsets);
6591
6592
/* GFX6-7 are affected by a hw bug that prevents address clamping to work
6593
* correctly when the SGPR offset is used.
6594
*/
6595
if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6596
offset = as_vgpr(ctx, offset);
6597
6598
for (unsigned i = 0; i < write_count; i++) {
6599
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6600
6601
aco_ptr<MUBUF_instruction> store{
6602
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6603
store->operands[0] = Operand(rsrc);
6604
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6605
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6606
store->operands[3] = Operand(write_datas[i]);
6607
store->offset = offsets[i];
6608
store->offen = (offset.type() == RegType::vgpr);
6609
store->glc = glc;
6610
store->dlc = false;
6611
store->disable_wqm = true;
6612
store->sync = sync;
6613
ctx->program->needs_exact = true;
6614
ctx->block->instructions.emplace_back(std::move(store));
6615
}
6616
}
6617
6618
void
6619
visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6620
{
6621
Builder bld(ctx->program, ctx->block);
6622
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6623
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6624
6625
if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6626
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6627
get_ssa_temp(ctx, instr->src[3].ssa), data);
6628
6629
Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6630
Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6631
6632
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6633
6634
aco_opcode op32, op64;
6635
switch (instr->intrinsic) {
6636
case nir_intrinsic_ssbo_atomic_add:
6637
op32 = aco_opcode::buffer_atomic_add;
6638
op64 = aco_opcode::buffer_atomic_add_x2;
6639
break;
6640
case nir_intrinsic_ssbo_atomic_imin:
6641
op32 = aco_opcode::buffer_atomic_smin;
6642
op64 = aco_opcode::buffer_atomic_smin_x2;
6643
break;
6644
case nir_intrinsic_ssbo_atomic_umin:
6645
op32 = aco_opcode::buffer_atomic_umin;
6646
op64 = aco_opcode::buffer_atomic_umin_x2;
6647
break;
6648
case nir_intrinsic_ssbo_atomic_imax:
6649
op32 = aco_opcode::buffer_atomic_smax;
6650
op64 = aco_opcode::buffer_atomic_smax_x2;
6651
break;
6652
case nir_intrinsic_ssbo_atomic_umax:
6653
op32 = aco_opcode::buffer_atomic_umax;
6654
op64 = aco_opcode::buffer_atomic_umax_x2;
6655
break;
6656
case nir_intrinsic_ssbo_atomic_and:
6657
op32 = aco_opcode::buffer_atomic_and;
6658
op64 = aco_opcode::buffer_atomic_and_x2;
6659
break;
6660
case nir_intrinsic_ssbo_atomic_or:
6661
op32 = aco_opcode::buffer_atomic_or;
6662
op64 = aco_opcode::buffer_atomic_or_x2;
6663
break;
6664
case nir_intrinsic_ssbo_atomic_xor:
6665
op32 = aco_opcode::buffer_atomic_xor;
6666
op64 = aco_opcode::buffer_atomic_xor_x2;
6667
break;
6668
case nir_intrinsic_ssbo_atomic_exchange:
6669
op32 = aco_opcode::buffer_atomic_swap;
6670
op64 = aco_opcode::buffer_atomic_swap_x2;
6671
break;
6672
case nir_intrinsic_ssbo_atomic_comp_swap:
6673
op32 = aco_opcode::buffer_atomic_cmpswap;
6674
op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6675
break;
6676
default:
6677
unreachable(
6678
"visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6679
}
6680
aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6681
aco_ptr<MUBUF_instruction> mubuf{
6682
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6683
mubuf->operands[0] = Operand(rsrc);
6684
mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6685
mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6686
mubuf->operands[3] = Operand(data);
6687
if (return_previous)
6688
mubuf->definitions[0] = Definition(dst);
6689
mubuf->offset = 0;
6690
mubuf->offen = (offset.type() == RegType::vgpr);
6691
mubuf->glc = return_previous;
6692
mubuf->dlc = false; /* Not needed for atomics */
6693
mubuf->disable_wqm = true;
6694
mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6695
ctx->program->needs_exact = true;
6696
ctx->block->instructions.emplace_back(std::move(mubuf));
6697
}
6698
6699
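/* The SSBO size lives in the third dword (num_records) of the buffer
 * descriptor. For a non-uniform (VGPR) result that dword is loaded straight
 * from descriptor memory at byte offset 8, while the uniform path builds the
 * resource with load_buffer_rsrc and extracts element 2. */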
void
visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
{

   Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   bool non_uniform = dst.type() == RegType::vgpr;

   Builder bld(ctx->program, ctx->block);
   if (non_uniform) {
      Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
      Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
      Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
      index = convert_pointer_to_64_bit(ctx, index, non_uniform);

      LoadEmitInfo info = {Operand(index), dst, 1, 4};
      info.align_mul = 4;
      info.const_offset = 8;
      emit_load(ctx, bld, info, global_load_params);
   } else {
      emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
   }
}

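/* Global loads can go through either SMEM or VMEM. SMEM is only usable when
 * the result is uniform (SGPR destination), the memory is not writeable by
 * the shader (VMEM stores bypass the scalar cache) and, before GFX8, when the
 * load does not need the GLC bit. */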
void
visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   unsigned num_components = instr->num_components;
   unsigned component_size = instr->dest.ssa.bit_size / 8;

   LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
                        get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
   info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.sync = get_memory_sync_info(instr, storage_buffer, 0);
   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
    * it's safe to use SMEM */
   bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
   if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
       !can_use_smem) {
      emit_load(ctx, bld, info, global_load_params);
   } else {
      info.offset = Operand(bld.as_uniform(info.offset));
      emit_load(ctx, bld, info, smem_load_params);
   }
}
6747
6748
void
6749
visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6750
{
6751
Builder bld(ctx->program, ctx->block);
6752
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6753
unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6754
6755
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6756
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6757
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6758
bool glc =
6759
nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6760
6761
if (ctx->options->chip_class >= GFX7)
6762
addr = as_vgpr(ctx, addr);
6763
6764
unsigned write_count = 0;
6765
Temp write_datas[32];
6766
unsigned offsets[32];
6767
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6768
write_datas, offsets);
6769
6770
for (unsigned i = 0; i < write_count; i++) {
6771
if (ctx->options->chip_class >= GFX7) {
6772
unsigned offset = offsets[i];
6773
Temp store_addr = addr;
6774
if (offset > 0 && ctx->options->chip_class < GFX9) {
6775
Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6776
Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6777
Temp carry = bld.tmp(bld.lm);
6778
bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6779
6780
bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6781
bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6782
bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6783
Operand::zero(), addr1, carry)
6784
.def(1)
6785
.setHint(vcc);
6786
6787
store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6788
6789
offset = 0;
6790
}
6791
6792
bool global = ctx->options->chip_class >= GFX9;
6793
aco_opcode op;
6794
switch (write_datas[i].bytes()) {
6795
case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6796
case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6797
case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6798
case 8:
6799
op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6800
break;
6801
case 12:
6802
op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6803
break;
6804
case 16:
6805
op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6806
break;
6807
default: unreachable("store_global not implemented for this size.");
6808
}
6809
6810
aco_ptr<FLAT_instruction> flat{
6811
create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6812
flat->operands[0] = Operand(store_addr);
6813
flat->operands[1] = Operand(s1);
6814
flat->operands[2] = Operand(write_datas[i]);
6815
flat->glc = glc;
6816
flat->dlc = false;
6817
flat->offset = offset;
6818
flat->disable_wqm = true;
6819
flat->sync = sync;
6820
ctx->program->needs_exact = true;
6821
ctx->block->instructions.emplace_back(std::move(flat));
6822
} else {
6823
assert(ctx->options->chip_class == GFX6);
6824
6825
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6826
6827
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6828
6829
aco_ptr<MUBUF_instruction> mubuf{
6830
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6831
mubuf->operands[0] = Operand(rsrc);
6832
mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6833
mubuf->operands[2] = Operand::zero();
6834
mubuf->operands[3] = Operand(write_datas[i]);
6835
mubuf->glc = glc;
6836
mubuf->dlc = false;
6837
mubuf->offset = offsets[i];
6838
mubuf->addr64 = addr.type() == RegType::vgpr;
6839
mubuf->disable_wqm = true;
6840
mubuf->sync = sync;
6841
ctx->program->needs_exact = true;
6842
ctx->block->instructions.emplace_back(std::move(mubuf));
6843
}
6844
}
6845
}
6846
6847
void
6848
visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6849
{
6850
Builder bld(ctx->program, ctx->block);
6851
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6852
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6853
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6854
6855
if (ctx->options->chip_class >= GFX7)
6856
addr = as_vgpr(ctx, addr);
6857
6858
if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6859
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6860
get_ssa_temp(ctx, instr->src[2].ssa), data);
6861
6862
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6863
6864
aco_opcode op32, op64;
6865
6866
if (ctx->options->chip_class >= GFX7) {
6867
bool global = ctx->options->chip_class >= GFX9;
6868
switch (instr->intrinsic) {
6869
case nir_intrinsic_global_atomic_add:
6870
op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6871
op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6872
break;
6873
case nir_intrinsic_global_atomic_imin:
6874
op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6875
op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6876
break;
6877
case nir_intrinsic_global_atomic_umin:
6878
op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6879
op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6880
break;
6881
case nir_intrinsic_global_atomic_imax:
6882
op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6883
op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6884
break;
6885
case nir_intrinsic_global_atomic_umax:
6886
op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6887
op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6888
break;
6889
case nir_intrinsic_global_atomic_and:
6890
op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6891
op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6892
break;
6893
case nir_intrinsic_global_atomic_or:
6894
op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6895
op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6896
break;
6897
case nir_intrinsic_global_atomic_xor:
6898
op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6899
op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6900
break;
6901
case nir_intrinsic_global_atomic_exchange:
6902
op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6903
op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6904
break;
6905
case nir_intrinsic_global_atomic_comp_swap:
6906
op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6907
op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6908
break;
6909
default:
6910
unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6911
"instructions.");
6912
}
6913
6914
aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6915
aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6916
op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6917
flat->operands[0] = Operand(addr);
6918
flat->operands[1] = Operand(s1);
6919
flat->operands[2] = Operand(data);
6920
if (return_previous)
6921
flat->definitions[0] = Definition(dst);
6922
flat->glc = return_previous;
6923
flat->dlc = false; /* Not needed for atomics */
6924
flat->offset = 0;
6925
flat->disable_wqm = true;
6926
flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6927
ctx->program->needs_exact = true;
6928
ctx->block->instructions.emplace_back(std::move(flat));
6929
} else {
6930
assert(ctx->options->chip_class == GFX6);
6931
6932
switch (instr->intrinsic) {
6933
case nir_intrinsic_global_atomic_add:
6934
op32 = aco_opcode::buffer_atomic_add;
6935
op64 = aco_opcode::buffer_atomic_add_x2;
6936
break;
6937
case nir_intrinsic_global_atomic_imin:
6938
op32 = aco_opcode::buffer_atomic_smin;
6939
op64 = aco_opcode::buffer_atomic_smin_x2;
6940
break;
6941
case nir_intrinsic_global_atomic_umin:
6942
op32 = aco_opcode::buffer_atomic_umin;
6943
op64 = aco_opcode::buffer_atomic_umin_x2;
6944
break;
6945
case nir_intrinsic_global_atomic_imax:
6946
op32 = aco_opcode::buffer_atomic_smax;
6947
op64 = aco_opcode::buffer_atomic_smax_x2;
6948
break;
6949
case nir_intrinsic_global_atomic_umax:
6950
op32 = aco_opcode::buffer_atomic_umax;
6951
op64 = aco_opcode::buffer_atomic_umax_x2;
6952
break;
6953
case nir_intrinsic_global_atomic_and:
6954
op32 = aco_opcode::buffer_atomic_and;
6955
op64 = aco_opcode::buffer_atomic_and_x2;
6956
break;
6957
case nir_intrinsic_global_atomic_or:
6958
op32 = aco_opcode::buffer_atomic_or;
6959
op64 = aco_opcode::buffer_atomic_or_x2;
6960
break;
6961
case nir_intrinsic_global_atomic_xor:
6962
op32 = aco_opcode::buffer_atomic_xor;
6963
op64 = aco_opcode::buffer_atomic_xor_x2;
6964
break;
6965
case nir_intrinsic_global_atomic_exchange:
6966
op32 = aco_opcode::buffer_atomic_swap;
6967
op64 = aco_opcode::buffer_atomic_swap_x2;
6968
break;
6969
case nir_intrinsic_global_atomic_comp_swap:
6970
op32 = aco_opcode::buffer_atomic_cmpswap;
6971
op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6972
break;
6973
default:
6974
unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6975
"instructions.");
6976
}
6977
6978
      Temp rsrc = get_gfx6_global_rsrc(bld, addr);

      aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;

      aco_ptr<MUBUF_instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
      mubuf->operands[0] = Operand(rsrc);
      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
      mubuf->operands[2] = Operand::zero();
      mubuf->operands[3] = Operand(data);
      if (return_previous)
         mubuf->definitions[0] = Definition(dst);
      mubuf->glc = return_previous;
      mubuf->dlc = false;
      mubuf->offset = 0;
      mubuf->addr64 = addr.type() == RegType::vgpr;
      mubuf->disable_wqm = true;
      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
   }
}
7000
7001
void
visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Builder bld(ctx->program, ctx->block);

   Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
   Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
   Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool reorder = nir_intrinsic_can_reorder(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
   unsigned num_components = intrin->dest.ssa.num_components;
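   /* When the buffer is swizzled (e.g. ring buffer access), a single element must not be split
    * across a swizzle boundary; the granularity is presumably 4 bytes on GFX8 and older and
    * 16 bytes on newer chips, which is the limit passed down to load_vmem_mubuf here. */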
   unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;

   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                   num_components, swizzle_element_size, !swizzled, reorder, slc);
}

void
visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
{
   Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
   Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
   Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
   Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);

   bool swizzled = nir_intrinsic_is_swizzled(intrin);
   bool slc = nir_intrinsic_slc_amd(intrin);

   unsigned const_offset = nir_intrinsic_base(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;

   nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
   memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);

   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
                    write_mask, !swizzled, sync, slc);
}
7045
7046
sync_scope
translate_nir_scope(nir_scope scope)
{
   switch (scope) {
   case NIR_SCOPE_NONE:
   case NIR_SCOPE_INVOCATION: return scope_invocation;
   case NIR_SCOPE_SUBGROUP: return scope_subgroup;
   case NIR_SCOPE_WORKGROUP: return scope_workgroup;
   case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
   case NIR_SCOPE_DEVICE: return scope_device;
   case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope");
   }
   unreachable("invalid scope");
}
7060
7061
void
emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   unsigned semantics = 0;
   unsigned storage = 0;
   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));

   /* We use shared storage for the following:
    * - compute shaders expose it in their API
    * - when tessellation is used, TCS and VS I/O is lowered to shared memory
    * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
    * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
    */
   bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
                              ctx->stage.hw == HWStage::HS ||
                              (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
                              ctx->stage.hw == HWStage::NGG;

   /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
    * They are allowed in CS, TCS, and in any NGG shader.
    */
   ASSERTED bool workgroup_scope_allowed =
      ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;

   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
   if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
      storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
   if (shared_storage_used && (nir_storage & nir_var_mem_shared))
      storage |= storage_shared;

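   /* Note: both NIR_MEMORY_ACQUIRE and NIR_MEMORY_RELEASE are folded into the combined
    * acquire|release semantics below. This is conservative (stronger than a one-sided barrier
    * strictly needs) but it keeps the sync-info handling simple. */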
   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
   if (nir_semantics & NIR_MEMORY_ACQUIRE)
      semantics |= semantic_acquire | semantic_release;
   if (nir_semantics & NIR_MEMORY_RELEASE)
      semantics |= semantic_acquire | semantic_release;

   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);

   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
               exec_scope);
}
7107
7108
void
visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Builder bld(ctx->program, ctx->block);

   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
   unsigned num_components = instr->dest.ssa.num_components;
   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
}

void
visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned writemask = nir_intrinsic_write_mask(instr);
   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;

   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
}
7133
7134
void
7135
visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7136
{
7137
unsigned offset = nir_intrinsic_base(instr);
7138
Builder bld(ctx->program, ctx->block);
7139
Operand m = load_lds_size_m0(bld);
7140
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7141
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7142
7143
unsigned num_operands = 3;
7144
aco_opcode op32, op64, op32_rtn, op64_rtn;
7145
switch (instr->intrinsic) {
7146
case nir_intrinsic_shared_atomic_add:
7147
op32 = aco_opcode::ds_add_u32;
7148
op64 = aco_opcode::ds_add_u64;
7149
op32_rtn = aco_opcode::ds_add_rtn_u32;
7150
op64_rtn = aco_opcode::ds_add_rtn_u64;
7151
break;
7152
case nir_intrinsic_shared_atomic_imin:
7153
op32 = aco_opcode::ds_min_i32;
7154
op64 = aco_opcode::ds_min_i64;
7155
op32_rtn = aco_opcode::ds_min_rtn_i32;
7156
op64_rtn = aco_opcode::ds_min_rtn_i64;
7157
break;
7158
case nir_intrinsic_shared_atomic_umin:
7159
op32 = aco_opcode::ds_min_u32;
7160
op64 = aco_opcode::ds_min_u64;
7161
op32_rtn = aco_opcode::ds_min_rtn_u32;
7162
op64_rtn = aco_opcode::ds_min_rtn_u64;
7163
break;
7164
case nir_intrinsic_shared_atomic_imax:
7165
op32 = aco_opcode::ds_max_i32;
7166
op64 = aco_opcode::ds_max_i64;
7167
op32_rtn = aco_opcode::ds_max_rtn_i32;
7168
op64_rtn = aco_opcode::ds_max_rtn_i64;
7169
break;
7170
case nir_intrinsic_shared_atomic_umax:
7171
op32 = aco_opcode::ds_max_u32;
7172
op64 = aco_opcode::ds_max_u64;
7173
op32_rtn = aco_opcode::ds_max_rtn_u32;
7174
op64_rtn = aco_opcode::ds_max_rtn_u64;
7175
break;
7176
case nir_intrinsic_shared_atomic_and:
7177
op32 = aco_opcode::ds_and_b32;
7178
op64 = aco_opcode::ds_and_b64;
7179
op32_rtn = aco_opcode::ds_and_rtn_b32;
7180
op64_rtn = aco_opcode::ds_and_rtn_b64;
7181
break;
7182
case nir_intrinsic_shared_atomic_or:
7183
op32 = aco_opcode::ds_or_b32;
7184
op64 = aco_opcode::ds_or_b64;
7185
op32_rtn = aco_opcode::ds_or_rtn_b32;
7186
op64_rtn = aco_opcode::ds_or_rtn_b64;
7187
break;
7188
case nir_intrinsic_shared_atomic_xor:
7189
op32 = aco_opcode::ds_xor_b32;
7190
op64 = aco_opcode::ds_xor_b64;
7191
op32_rtn = aco_opcode::ds_xor_rtn_b32;
7192
op64_rtn = aco_opcode::ds_xor_rtn_b64;
7193
break;
7194
case nir_intrinsic_shared_atomic_exchange:
7195
op32 = aco_opcode::ds_write_b32;
7196
op64 = aco_opcode::ds_write_b64;
7197
op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7198
op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7199
break;
7200
case nir_intrinsic_shared_atomic_comp_swap:
7201
op32 = aco_opcode::ds_cmpst_b32;
7202
op64 = aco_opcode::ds_cmpst_b64;
7203
op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7204
op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7205
num_operands = 4;
7206
break;
7207
case nir_intrinsic_shared_atomic_fadd:
7208
op32 = aco_opcode::ds_add_f32;
7209
op32_rtn = aco_opcode::ds_add_rtn_f32;
7210
op64 = aco_opcode::num_opcodes;
7211
op64_rtn = aco_opcode::num_opcodes;
7212
break;
7213
default: unreachable("Unhandled shared atomic intrinsic");
7214
}
7215
7216
bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7217
7218
aco_opcode op;
7219
if (data.size() == 1) {
7220
assert(instr->dest.ssa.bit_size == 32);
7221
op = return_previous ? op32_rtn : op32;
7222
} else {
7223
assert(instr->dest.ssa.bit_size == 64);
7224
op = return_previous ? op64_rtn : op64;
7225
}
7226
7227
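   /* The DS offset0 field is a 16-bit immediate, so a larger constant offset has to be folded
    * into the address register instead. */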
   if (offset > 65535) {
      address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
      offset = 0;
   }

   aco_ptr<DS_instruction> ds;
   ds.reset(
      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
   ds->operands[0] = Operand(address);
   ds->operands[1] = Operand(data);
   if (num_operands == 4) {
      Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
      ds->operands[2] = Operand(data2);
   }
   ds->operands[num_operands - 1] = m;
   ds->offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
   ctx->block->instructions.emplace_back(std::move(ds));
}
7248
7249
Temp
get_scratch_resource(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);
   Temp scratch_addr = ctx->program->private_segment_buffer;
   if (ctx->stage != compute_cs)
      scratch_addr =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());

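   /* ADD_TID_ENABLE makes the hardware add the thread id (scaled by the swizzle stride) to the
    * address, which is how the scratch accesses of the lanes in a wave get interleaved.
    * INDEX_STRIDE presumably encodes that stride: 3 selects 64 elements for wave64 and
    * 2 selects 32 elements for wave32. */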
   uint32_t rsrc_conf =
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);

   if (ctx->program->chip_class >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
   } else if (ctx->program->chip_class <= GFX7) {
      /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx->program->chip_class <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}
7277
7278
void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp rsrc = get_scratch_resource(ctx);
   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

   LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
                        instr->dest.ssa.bit_size / 8u, rsrc};
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
   info.sync = memory_sync_info(storage_scratch, semantic_private);
   info.soffset = ctx->program->scratch_offset;
   emit_load(ctx, bld, info, scratch_load_params);
}
7295
7296
void
visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp rsrc = get_scratch_resource(ctx);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));

   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
                      &write_count, write_datas, offsets);

   for (unsigned i = 0; i < write_count; i++) {
      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
      Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
                                     offsets[i], true, true);
      mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
   }
}
7321
7322
void
7323
visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7324
{
7325
uint8_t log2_ps_iter_samples;
7326
if (ctx->program->info->ps.uses_sample_shading) {
7327
log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples);
7328
} else {
7329
log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
7330
}
7331
7332
Builder bld(ctx->program, ctx->block);
7333
7334
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7335
7336
if (log2_ps_iter_samples) {
7337
/* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
7338
Temp sample_id =
7339
bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7340
Operand::c32(8u), Operand::c32(4u));
7341
Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7342
bld.copy(bld.def(v1), Operand::c32(1u)));
7343
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7344
get_arg(ctx, ctx->args->ac.sample_coverage));
7345
} else {
7346
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7347
}
7348
}
7349
7350
void
7351
visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7352
{
7353
Builder bld(ctx->program, ctx->block);
7354
7355
unsigned stream = nir_intrinsic_stream_id(instr);
7356
Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7357
next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7358
nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7359
7360
/* get GSVS ring */
7361
Temp gsvs_ring =
7362
bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7363
Operand::c32(RING_GSVS_GS * 16u));
7364
7365
unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7366
7367
unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7368
unsigned stream_offset = 0;
7369
for (unsigned i = 0; i < stream; i++) {
7370
unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7371
ctx->shader->info.gs.vertices_out;
7372
stream_offset += prev_stride * ctx->program->wave_size;
7373
}
7374
7375
/* Limit on the stride field for <= GFX7. */
7376
assert(stride < (1 << 14));
7377
7378
Temp gsvs_dwords[4];
7379
for (unsigned i = 0; i < 4; i++)
7380
gsvs_dwords[i] = bld.tmp(s1);
7381
bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7382
Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7383
7384
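   /* Patch the GSVS ring descriptor: add the per-stream offset to the 64-bit base address
    * (dwords 0-1), OR the ring stride into dword 1 and set dword 2 (num_records) to the
    * wave size. */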
if (stream_offset) {
7385
Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7386
7387
Temp carry = bld.tmp(s1);
7388
gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7389
gsvs_dwords[0], stream_offset_tmp);
7390
gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7391
gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7392
}
7393
7394
gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7395
Operand::c32(S_008F04_STRIDE(stride)));
7396
gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7397
7398
gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7399
gsvs_dwords[2], gsvs_dwords[3]);
7400
7401
unsigned offset = 0;
7402
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7403
if (ctx->program->info->gs.output_streams[i] != stream)
7404
continue;
7405
7406
for (unsigned j = 0; j < 4; j++) {
7407
if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7408
continue;
7409
7410
if (ctx->outputs.mask[i] & (1 << j)) {
7411
Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7412
unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
7413
if (const_offset >= 4096u) {
7414
if (vaddr_offset.isUndefined())
7415
vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7416
else
7417
vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7418
vaddr_offset);
7419
const_offset %= 4096u;
7420
}
7421
7422
aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7423
aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7424
mtbuf->operands[0] = Operand(gsvs_ring);
7425
mtbuf->operands[1] = vaddr_offset;
7426
mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7427
mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7428
mtbuf->offen = !vaddr_offset.isUndefined();
7429
mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7430
mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7431
mtbuf->offset = const_offset;
7432
mtbuf->glc = true;
7433
mtbuf->slc = true;
7434
mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7435
bld.insert(std::move(mtbuf));
7436
}
7437
7438
offset += ctx->shader->info.gs.vertices_out;
7439
}
7440
7441
/* outputs for the next vertex are undefined and keeping them around can
7442
* create invalid IR with control flow */
7443
ctx->outputs.mask[i] = 0;
7444
}
7445
7446
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7447
}
7448
7449
Temp
7450
emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7451
{
7452
Builder bld(ctx->program, ctx->block);
7453
7454
if (cluster_size == 1) {
7455
return src;
7456
}
7457
if (op == nir_op_iand && cluster_size == 4) {
7458
/* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
7459
Temp tmp =
7460
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7461
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7462
bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7463
} else if (op == nir_op_ior && cluster_size == 4) {
7464
/* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7465
return bld.sop1(
7466
Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7467
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7468
} else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7469
/* subgroupAnd(val) -> (exec & ~val) == 0 */
7470
Temp tmp =
7471
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7472
.def(1)
7473
.getTemp();
7474
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7475
return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7476
} else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7477
/* subgroupOr(val) -> (val & exec) != 0 */
7478
Temp tmp =
7479
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7480
.def(1)
7481
.getTemp();
7482
return bool_to_vector_condition(ctx, tmp);
7483
} else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7484
/* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7485
Temp tmp =
7486
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7487
tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7488
tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7489
.def(1)
7490
.getTemp();
7491
return bool_to_vector_condition(ctx, tmp);
7492
} else {
7493
      /* subgroupClustered{And,Or,Xor}(val, n):
       * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
       * cluster_offset = ~(n - 1) & lane_id
       * cluster_mask = ((1 << n) - 1)
       * subgroupClusteredAnd():
       *    return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
       * subgroupClusteredOr():
       *    return ((val & exec) >> cluster_offset) & cluster_mask != 0
       * subgroupClusteredXor():
       *    return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
       */
7503
Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7504
Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7505
Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7506
7507
Temp tmp;
7508
if (op == nir_op_iand)
7509
tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7510
Operand(exec, bld.lm));
7511
else
7512
tmp =
7513
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7514
7515
uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7516
7517
if (ctx->program->chip_class <= GFX7)
7518
tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7519
else if (ctx->program->wave_size == 64)
7520
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7521
else
7522
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7523
tmp = emit_extract_vector(ctx, tmp, 0, v1);
7524
if (cluster_mask != 0xffffffff)
7525
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7526
7527
if (op == nir_op_iand) {
7528
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7529
tmp);
7530
} else if (op == nir_op_ior) {
7531
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7532
} else if (op == nir_op_ixor) {
7533
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7534
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7535
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7536
}
7537
assert(false);
7538
return Temp();
7539
}
7540
}
7541
7542
Temp
7543
emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7544
{
7545
Builder bld(ctx->program, ctx->block);
7546
assert(src.regClass() == bld.lm);
7547
7548
/* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7549
* subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7550
* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7551
*/
7552
Temp tmp;
7553
if (op == nir_op_iand)
7554
tmp =
7555
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7556
else
7557
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7558
7559
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7560
7561
if (op == nir_op_iand)
7562
return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7563
else if (op == nir_op_ior)
7564
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7565
else if (op == nir_op_ixor)
7566
return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7567
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7568
7569
assert(false);
7570
return Temp();
7571
}
7572
7573
Temp
7574
emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7575
{
7576
Builder bld(ctx->program, ctx->block);
7577
7578
/* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7579
* subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7580
* subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7581
*/
7582
Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7583
if (op == nir_op_iand)
7584
return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7585
else if (op == nir_op_ior)
7586
return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7587
else if (op == nir_op_ixor)
7588
return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7589
7590
assert(false);
7591
return Temp();
7592
}
7593
7594
ReduceOp
7595
get_reduce_op(nir_op op, unsigned bit_size)
7596
{
7597
switch (op) {
7598
#define CASEI(name) \
7599
case nir_op_##name: \
7600
return (bit_size == 32) ? name##32 \
7601
: (bit_size == 16) ? name##16 \
7602
: (bit_size == 8) ? name##8 \
7603
: name##64;
7604
#define CASEF(name) \
7605
case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7606
CASEI(iadd)
7607
CASEI(imul)
7608
CASEI(imin)
7609
CASEI(umin)
7610
CASEI(imax)
7611
CASEI(umax)
7612
CASEI(iand)
7613
CASEI(ior)
7614
CASEI(ixor)
7615
CASEF(fadd)
7616
CASEF(fmul)
7617
CASEF(fmin)
7618
CASEF(fmax)
7619
default: unreachable("unknown reduction op");
7620
#undef CASEI
7621
#undef CASEF
7622
}
7623
}
7624
7625
void
7626
emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7627
{
7628
Builder bld(ctx->program, ctx->block);
7629
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7630
assert(dst.regClass().type() != RegType::vgpr);
7631
if (src.regClass().type() == RegType::vgpr)
7632
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7633
else
7634
bld.copy(dst, src);
7635
}
7636
7637
void
7638
emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7639
{
7640
Builder bld(ctx->program, ctx->block);
7641
Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7642
7643
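   /* With a uniform source, the sum over all active lanes is simply count * val (for ixor only
    * the parity of count matters, which is handled below), so the whole reduction collapses to
    * a single multiply. */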
if (op == nir_op_fadd) {
7644
src_tmp = as_vgpr(ctx, src_tmp);
7645
Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7646
7647
if (src.ssa->bit_size == 16) {
7648
count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7649
bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7650
} else {
7651
assert(src.ssa->bit_size == 32);
7652
count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7653
bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7654
}
7655
7656
if (tmp != dst.getTemp())
7657
bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7658
7659
return;
7660
}
7661
7662
if (dst.regClass() == s1)
7663
src_tmp = bld.as_uniform(src_tmp);
7664
7665
if (op == nir_op_ixor && count.type() == RegType::sgpr)
7666
count =
7667
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7668
else if (op == nir_op_ixor)
7669
count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7670
7671
assert(dst.getTemp().type() == count.type());
7672
7673
if (nir_src_is_const(src)) {
7674
if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7675
bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7676
else if (nir_src_as_uint(src) == 1)
7677
bld.copy(dst, count);
7678
else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7679
bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7680
else if (nir_src_as_uint(src) == 0)
7681
bld.copy(dst, Operand::zero());
7682
else if (count.type() == RegType::vgpr)
7683
bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7684
else
7685
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7686
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7687
bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7688
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7689
bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7690
} else if (dst.getTemp().type() == RegType::vgpr) {
7691
bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7692
} else {
7693
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7694
}
7695
}
7696
7697
bool
7698
emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7699
{
7700
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7701
if (op == nir_op_imul || op == nir_op_fmul)
7702
return false;
7703
7704
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7705
Builder bld(ctx->program, ctx->block);
7706
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7707
unsigned bit_size = instr->src[0].ssa->bit_size;
7708
if (bit_size > 32)
7709
return false;
7710
7711
Temp thread_count =
7712
bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7713
7714
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7715
} else {
7716
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7717
}
7718
7719
return true;
7720
}
7721
7722
bool
7723
emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7724
{
7725
Builder bld(ctx->program, ctx->block);
7726
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7727
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7728
bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7729
7730
if (op == nir_op_imul || op == nir_op_fmul)
7731
return false;
7732
7733
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7734
if (instr->src[0].ssa->bit_size > 32)
7735
return false;
7736
7737
Temp packed_tid;
7738
if (inc)
7739
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7740
else
7741
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7742
7743
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7744
return true;
7745
}
7746
7747
assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7748
op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7749
7750
if (inc) {
7751
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7752
return true;
7753
}
7754
7755
/* Copy the source and write the reduction operation identity to the first lane. */
7756
Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7757
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7758
ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7759
if (dst.bytes() == 8) {
7760
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7761
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7762
uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7763
uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7764
7765
lo =
7766
bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7767
hi =
7768
bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7769
bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7770
} else {
7771
uint32_t identity = get_reduction_identity(reduce_op, 0);
7772
bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7773
as_vgpr(ctx, src));
7774
}
7775
7776
return true;
7777
}
7778
7779
Temp
7780
emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7781
Definition dst, Temp src)
7782
{
7783
assert(src.bytes() <= 8);
7784
assert(src.type() == RegType::vgpr);
7785
7786
Builder bld(ctx->program, ctx->block);
7787
7788
unsigned num_defs = 0;
7789
Definition defs[5];
7790
defs[num_defs++] = dst;
7791
defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7792
7793
/* scalar identity temporary */
7794
bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7795
aco_op != aco_opcode::p_reduce;
7796
if (aco_op == aco_opcode::p_exclusive_scan) {
7797
need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7798
op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7799
op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7800
op == fmul64);
7801
}
7802
if (need_sitmp)
7803
defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7804
7805
/* scc clobber */
7806
defs[num_defs++] = bld.def(s1, scc);
7807
7808
/* vcc clobber */
7809
bool clobber_vcc = false;
7810
if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7811
clobber_vcc = true;
7812
if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7813
clobber_vcc = true;
7814
if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7815
clobber_vcc = true;
7816
7817
if (clobber_vcc)
7818
defs[num_defs++] = bld.def(bld.lm, vcc);
7819
7820
Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7821
aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7822
reduce->operands[0] = Operand(src);
7823
/* setup_reduce_temp will update these undef operands if needed */
7824
reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7825
reduce->operands[2] = Operand(v1.as_linear());
7826
std::copy(defs, defs + num_defs, reduce->definitions.begin());
7827
7828
reduce->reduce_op = op;
7829
reduce->cluster_size = cluster_size;
7830
bld.insert(std::move(reduce));
7831
7832
return dst.getTemp();
7833
}
7834
7835
void
7836
emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2)
7837
{
7838
Builder bld(ctx->program, ctx->block);
7839
Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7840
Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7841
Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7842
7843
Temp ddx_1, ddx_2, ddy_1, ddy_2;
7844
uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7845
uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7846
uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7847
7848
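   /* The quad_perm DPP controls broadcast one pixel of each 2x2 quad to all four lanes:
    * lane 0 is the top-left pixel, lane 1 its right neighbor and lane 2 the pixel below,
    * so ddx = p(lane1) - p(lane0) and ddy = p(lane2) - p(lane0). */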
/* Build DD X/Y */
7849
if (ctx->program->chip_class >= GFX8) {
7850
Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7851
ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7852
ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7853
Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7854
ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7855
ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7856
} else {
7857
Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7858
ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7859
ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7860
ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7861
ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7862
Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7863
ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7864
ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7865
ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7866
ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7867
}
7868
7869
/* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7870
aco_opcode mad =
7871
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7872
Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7873
Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7874
tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7875
tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7876
Temp wqm1 = bld.tmp(v1);
7877
emit_wqm(bld, tmp1, wqm1, true);
7878
Temp wqm2 = bld.tmp(v1);
7879
emit_wqm(bld, tmp2, wqm2, true);
7880
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7881
return;
7882
}
7883
7884
Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7885
void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7886
static void create_vs_exports(isel_context* ctx);
7887
7888
void
7889
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7890
{
7891
Builder bld(ctx->program, ctx->block);
7892
switch (instr->intrinsic) {
7893
case nir_intrinsic_load_barycentric_sample:
7894
case nir_intrinsic_load_barycentric_pixel:
7895
case nir_intrinsic_load_barycentric_centroid: {
7896
glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7897
Temp bary = Temp(0, s2);
7898
switch (mode) {
7899
case INTERP_MODE_SMOOTH:
7900
case INTERP_MODE_NONE:
7901
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7902
bary = get_arg(ctx, ctx->args->ac.persp_center);
7903
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7904
bary = ctx->persp_centroid;
7905
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7906
bary = get_arg(ctx, ctx->args->ac.persp_sample);
7907
break;
7908
case INTERP_MODE_NOPERSPECTIVE:
7909
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7910
bary = get_arg(ctx, ctx->args->ac.linear_center);
7911
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7912
bary = ctx->linear_centroid;
7913
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7914
bary = get_arg(ctx, ctx->args->ac.linear_sample);
7915
break;
7916
default: break;
7917
}
7918
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7919
Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7920
Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7921
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7922
emit_split_vector(ctx, dst, 2);
7923
break;
7924
}
7925
case nir_intrinsic_load_barycentric_model: {
7926
Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7927
7928
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7929
Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7930
Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7931
Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7932
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7933
Operand(p3));
7934
emit_split_vector(ctx, dst, 3);
7935
break;
7936
}
7937
case nir_intrinsic_load_barycentric_at_sample: {
7938
uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7939
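      /* The sample positions table presumably stores the 1x, 2x, 4x and 8x patterns back to
       * back, 8 bytes per position, so skip 1, 1+2 or 1+2+4 entries to reach the start of the
       * pattern for the current sample count. */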
switch (ctx->options->key.fs.num_samples) {
7940
case 2: sample_pos_offset += 1 << 3; break;
7941
case 4: sample_pos_offset += 3 << 3; break;
7942
case 8: sample_pos_offset += 7 << 3; break;
7943
default: break;
7944
}
7945
Temp sample_pos;
7946
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7947
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7948
Temp private_segment_buffer = ctx->program->private_segment_buffer;
7949
// TODO: bounds checking?
7950
if (addr.type() == RegType::sgpr) {
7951
Operand offset;
7952
if (const_addr) {
7953
sample_pos_offset += const_addr->u32 << 3;
7954
offset = Operand::c32(sample_pos_offset);
7955
} else if (ctx->options->chip_class >= GFX9) {
7956
offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7957
Operand::c32(sample_pos_offset));
7958
} else {
7959
offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7960
Operand::c32(3u));
7961
offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7962
Operand::c32(sample_pos_offset));
7963
}
7964
7965
Operand off = bld.copy(bld.def(s1), Operand(offset));
7966
sample_pos =
7967
bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7968
7969
} else if (ctx->options->chip_class >= GFX9) {
7970
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7971
sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7972
private_segment_buffer, sample_pos_offset);
7973
} else if (ctx->options->chip_class >= GFX7) {
7974
/* addr += private_segment_buffer + sample_pos_offset */
7975
Temp tmp0 = bld.tmp(s1);
7976
Temp tmp1 = bld.tmp(s1);
7977
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7978
private_segment_buffer);
7979
Definition scc_tmp = bld.def(s1, scc);
7980
tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7981
Operand::c32(sample_pos_offset));
7982
tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7983
Operand::zero(), bld.scc(scc_tmp.getTemp()));
7984
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7985
Temp pck0 = bld.tmp(v1);
7986
Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7987
tmp1 = as_vgpr(ctx, tmp1);
7988
Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
7989
bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
7990
addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7991
7992
/* sample_pos = flat_load_dwordx2 addr */
7993
sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7994
} else {
7995
assert(ctx->options->chip_class == GFX6);
7996
7997
uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7998
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7999
Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8000
Operand::zero(), Operand::c32(rsrc_conf));
8001
8002
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8003
addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8004
8005
sample_pos = bld.tmp(v2);
8006
8007
aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8008
aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8009
load->definitions[0] = Definition(sample_pos);
8010
load->operands[0] = Operand(rsrc);
8011
load->operands[1] = Operand(addr);
8012
load->operands[2] = Operand::zero();
8013
load->offset = sample_pos_offset;
8014
load->offen = 0;
8015
load->addr64 = true;
8016
load->glc = false;
8017
load->dlc = false;
8018
load->disable_wqm = false;
8019
ctx->block->instructions.emplace_back(std::move(load));
8020
}
8021
8022
/* sample_pos -= 0.5 */
8023
Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8024
Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8025
bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8026
pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8027
pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8028
8029
emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
8030
break;
8031
}
8032
case nir_intrinsic_load_barycentric_at_offset: {
8033
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8034
RegClass rc = RegClass(offset.type(), 1);
8035
Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8036
bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8037
emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
8038
break;
8039
}
8040
case nir_intrinsic_load_front_face: {
8041
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8042
Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8043
.def(0)
8044
.setHint(vcc);
8045
break;
8046
}
8047
case nir_intrinsic_load_view_index: {
8048
if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) ||
8049
ctx->stage.has(SWStage::TCS) || ctx->stage.has(SWStage::TES)) {
8050
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8051
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8052
break;
8053
}
8054
FALLTHROUGH;
8055
}
8056
case nir_intrinsic_load_layer_id: {
8057
unsigned idx = nir_intrinsic_base(instr);
8058
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8059
Operand::c32(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
8060
break;
8061
}
8062
case nir_intrinsic_load_frag_coord: {
8063
emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8064
break;
8065
}
8066
case nir_intrinsic_load_frag_shading_rate:
8067
emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8068
break;
8069
case nir_intrinsic_load_sample_pos: {
8070
Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8071
Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8072
bld.pseudo(
8073
aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8074
posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8075
posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8076
break;
8077
}
8078
case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8079
case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8080
case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8081
case nir_intrinsic_load_input:
8082
case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8083
case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8084
case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8085
case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8086
case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8087
case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8088
case nir_intrinsic_terminate:
8089
case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8090
case nir_intrinsic_terminate_if:
8091
case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8092
case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8093
case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8094
case nir_intrinsic_shared_atomic_add:
8095
case nir_intrinsic_shared_atomic_imin:
8096
case nir_intrinsic_shared_atomic_umin:
8097
case nir_intrinsic_shared_atomic_imax:
8098
case nir_intrinsic_shared_atomic_umax:
8099
case nir_intrinsic_shared_atomic_and:
8100
case nir_intrinsic_shared_atomic_or:
8101
case nir_intrinsic_shared_atomic_xor:
8102
case nir_intrinsic_shared_atomic_exchange:
8103
case nir_intrinsic_shared_atomic_comp_swap:
8104
case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break;
8105
case nir_intrinsic_image_deref_load:
8106
case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8107
case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8108
case nir_intrinsic_image_deref_atomic_add:
8109
case nir_intrinsic_image_deref_atomic_umin:
8110
case nir_intrinsic_image_deref_atomic_imin:
8111
case nir_intrinsic_image_deref_atomic_umax:
8112
case nir_intrinsic_image_deref_atomic_imax:
8113
case nir_intrinsic_image_deref_atomic_and:
8114
case nir_intrinsic_image_deref_atomic_or:
8115
case nir_intrinsic_image_deref_atomic_xor:
8116
case nir_intrinsic_image_deref_atomic_exchange:
8117
case nir_intrinsic_image_deref_atomic_comp_swap: visit_image_atomic(ctx, instr); break;
8118
case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8119
case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8120
case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8121
case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8122
case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8123
case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8124
case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8125
case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8126
case nir_intrinsic_global_atomic_add:
8127
case nir_intrinsic_global_atomic_imin:
8128
case nir_intrinsic_global_atomic_umin:
8129
case nir_intrinsic_global_atomic_imax:
8130
case nir_intrinsic_global_atomic_umax:
8131
case nir_intrinsic_global_atomic_and:
8132
case nir_intrinsic_global_atomic_or:
8133
case nir_intrinsic_global_atomic_xor:
8134
case nir_intrinsic_global_atomic_exchange:
8135
case nir_intrinsic_global_atomic_comp_swap: visit_global_atomic(ctx, instr); break;
8136
case nir_intrinsic_ssbo_atomic_add:
8137
case nir_intrinsic_ssbo_atomic_imin:
8138
case nir_intrinsic_ssbo_atomic_umin:
8139
case nir_intrinsic_ssbo_atomic_imax:
8140
case nir_intrinsic_ssbo_atomic_umax:
8141
case nir_intrinsic_ssbo_atomic_and:
8142
case nir_intrinsic_ssbo_atomic_or:
8143
case nir_intrinsic_ssbo_atomic_xor:
8144
case nir_intrinsic_ssbo_atomic_exchange:
8145
case nir_intrinsic_ssbo_atomic_comp_swap: visit_atomic_ssbo(ctx, instr); break;
8146
case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8147
case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8148
case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8149
case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8150
case nir_intrinsic_load_num_workgroups: {
8151
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8152
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8153
emit_split_vector(ctx, dst, 3);
8154
break;
8155
}
8156
case nir_intrinsic_load_local_invocation_id: {
8157
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8158
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8159
emit_split_vector(ctx, dst, 3);
8160
break;
8161
}
8162
case nir_intrinsic_load_workgroup_id: {
8163
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8164
struct ac_arg* args = ctx->args->ac.workgroup_ids;
8165
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8166
args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8167
args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8168
args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8169
emit_split_vector(ctx, dst, 3);
8170
break;
8171
}
8172
case nir_intrinsic_load_local_invocation_index: {
8173
if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8174
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8175
get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8176
break;
8177
} else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8178
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8179
break;
8180
}
8181
8182
Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8183
8184
/* The tg_size bits [6:11] contain the subgroup id,
8185
* we need this multiplied by the wave size, and then OR the thread id to it.
8186
*/
8187
if (ctx->program->wave_size == 64) {
8188
/* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8189
* feed that to v_or */
8190
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8191
Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8192
bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8193
id);
8194
} else {
8195
/* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8196
Temp tg_num =
8197
bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8198
get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8199
bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8200
tg_num, Operand::c32(0x5u), id);
8201
}
8202
break;
8203
}
8204
case nir_intrinsic_load_subgroup_id: {
8205
if (ctx->stage == compute_cs) {
8206
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8207
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8208
Operand::c32(0x6u | (0x6u << 16)));
8209
} else if (ctx->stage.hw == HWStage::NGG) {
8210
/* Get the id of the current wave within the threadgroup (workgroup) */
8211
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8212
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8213
Operand::c32(24u | (4u << 16)));
8214
} else {
8215
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8216
}
8217
break;
8218
}
8219
case nir_intrinsic_load_subgroup_invocation: {
8220
emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8221
break;
8222
}
8223
case nir_intrinsic_load_num_subgroups: {
8224
if (ctx->stage == compute_cs)
8225
bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8226
bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8227
else if (ctx->stage.hw == HWStage::NGG)
8228
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8229
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8230
Operand::c32(28u | (4u << 16)));
8231
else
8232
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8233
break;
8234
}
8235
case nir_intrinsic_ballot: {
8236
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8237
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8238
8239
if (instr->src[0].ssa->bit_size == 1) {
8240
assert(src.regClass() == bld.lm);
8241
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8242
src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8243
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8244
src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8245
} else {
8246
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8247
}
8248
8249
/* Make sure that all inactive lanes return zero.
8250
* Value-numbering might remove the comparison above */
8251
src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
8252
if (dst.size() != bld.lm.size()) {
8253
/* Wave32 with ballot size set to 64 */
8254
src =
8255
bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8256
}
8257
8258
emit_wqm(bld, src, dst);
8259
break;
8260
}
8261
case nir_intrinsic_shuffle:
8262
case nir_intrinsic_read_invocation: {
8263
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8264
if (!nir_src_is_divergent(instr->src[0])) {
8265
emit_uniform_subgroup(ctx, instr, src);
8266
} else {
8267
Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8268
if (instr->intrinsic == nir_intrinsic_read_invocation ||
8269
!nir_src_is_divergent(instr->src[1]))
8270
tid = bld.as_uniform(tid);
8271
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8272
8273
if (instr->dest.ssa.bit_size != 1)
8274
src = as_vgpr(ctx, src);
8275
8276
if (src.regClass() == v1b || src.regClass() == v2b) {
8277
Temp tmp = bld.tmp(v1);
8278
tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8279
if (dst.type() == RegType::vgpr)
8280
bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8281
bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8282
else
8283
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8284
} else if (src.regClass() == v1) {
8285
emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8286
} else if (src.regClass() == v2) {
8287
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8288
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8289
lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8290
hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8291
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8292
emit_split_vector(ctx, dst, 2);
8293
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
8294
assert(src.regClass() == bld.lm);
8295
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8296
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8297
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
8298
assert(src.regClass() == bld.lm);
8299
Temp tmp;
8300
if (ctx->program->chip_class <= GFX7)
8301
tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8302
else if (ctx->program->wave_size == 64)
8303
tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8304
else
8305
tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8306
tmp = emit_extract_vector(ctx, tmp, 0, v1);
8307
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8308
emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8309
dst);
8310
} else {
8311
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8312
}
8313
}
8314
break;
8315
}
8316
case nir_intrinsic_load_sample_id: {
8317
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8318
get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8319
break;
8320
}
8321
case nir_intrinsic_load_sample_mask_in: {
8322
visit_load_sample_mask_in(ctx, instr);
8323
break;
8324
}
8325
case nir_intrinsic_read_first_invocation: {
8326
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8327
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8328
if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8329
emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8330
} else if (src.regClass() == v2) {
8331
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8332
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8333
lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8334
hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8335
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8336
emit_split_vector(ctx, dst, 2);
8337
} else if (instr->dest.ssa.bit_size == 1) {
8338
assert(src.regClass() == bld.lm);
8339
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8340
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8341
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8342
} else {
8343
bld.copy(Definition(dst), src);
8344
}
8345
break;
8346
}
8347
case nir_intrinsic_vote_all: {
8348
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8349
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8350
assert(src.regClass() == bld.lm);
8351
assert(dst.regClass() == bld.lm);
8352
8353
Temp tmp =
8354
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
8355
.def(1)
8356
.getTemp();
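/* tmp is the SCC result of (exec & ~src): it is set iff some active lane holds a
 * false value. Negating the vectorized condition below therefore yields
 * "all active lanes are true". */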
8357
Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8358
bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8359
break;
8360
}
8361
case nir_intrinsic_vote_any: {
8362
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8363
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8364
assert(src.regClass() == bld.lm);
8365
assert(dst.regClass() == bld.lm);
8366
8367
Temp tmp = bool_to_scalar_condition(ctx, src);
8368
bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8369
break;
8370
}
8371
case nir_intrinsic_reduce:
8372
case nir_intrinsic_inclusive_scan:
8373
case nir_intrinsic_exclusive_scan: {
8374
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8375
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8376
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8377
unsigned cluster_size =
8378
instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8379
cluster_size = util_next_power_of_two(
8380
MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8381
8382
if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8383
instr->dest.ssa.bit_size != 1) {
8384
/* We use divergence analysis to assign the regclass, so check if it's
8385
* working as expected */
8386
ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8387
if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8388
expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8389
assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
8390
8391
if (instr->intrinsic == nir_intrinsic_reduce) {
8392
if (emit_uniform_reduce(ctx, instr))
8393
break;
8394
} else if (emit_uniform_scan(ctx, instr)) {
8395
break;
8396
}
8397
}
8398
8399
if (instr->dest.ssa.bit_size == 1) {
8400
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8401
op = nir_op_iand;
8402
else if (op == nir_op_iadd)
8403
op = nir_op_ixor;
8404
else if (op == nir_op_umax || op == nir_op_imax)
8405
op = nir_op_ior;
8406
assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8407
8408
switch (instr->intrinsic) {
8409
case nir_intrinsic_reduce:
8410
emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8411
break;
8412
case nir_intrinsic_exclusive_scan:
8413
emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8414
break;
8415
case nir_intrinsic_inclusive_scan:
8416
emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8417
break;
8418
default: assert(false);
8419
}
8420
} else if (cluster_size == 1) {
8421
bld.copy(Definition(dst), src);
8422
} else {
8423
unsigned bit_size = instr->src[0].ssa->bit_size;
8424
8425
src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8426
8427
ReduceOp reduce_op = get_reduce_op(op, bit_size);
8428
8429
aco_opcode aco_op;
8430
switch (instr->intrinsic) {
8431
case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8432
case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8433
case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8434
default: unreachable("unknown reduce intrinsic");
8435
}
8436
8437
Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8438
bld.def(dst.regClass()), src);
8439
emit_wqm(bld, tmp_dst, dst);
8440
}
8441
break;
8442
}
8443
case nir_intrinsic_quad_broadcast: {
8444
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8445
if (!nir_dest_is_divergent(instr->dest)) {
8446
emit_uniform_subgroup(ctx, instr, src);
8447
} else {
8448
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8449
unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
8450
uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8451
8452
if (instr->dest.ssa.bit_size != 1)
8453
src = as_vgpr(ctx, src);
8454
8455
if (instr->dest.ssa.bit_size == 1) {
8456
assert(src.regClass() == bld.lm);
8457
assert(dst.regClass() == bld.lm);
8458
uint32_t half_mask = 0x11111111u << lane;
8459
Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
8460
Operand::c32(half_mask), Operand::c32(half_mask));
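/* Each quad owns 4 consecutive bits of the bool lane mask, so 0x11111111 << lane
 * selects bit `lane` of every quad. ANDing src with exec and with this mask keeps
 * only the broadcast lane's value per quad, and s_wqm then replicates any set bit
 * to all four bits of its quad. */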
8461
Temp tmp = bld.tmp(bld.lm);
8462
bld.sop1(Builder::s_wqm, Definition(tmp),
8463
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
8464
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8465
Operand(exec, bld.lm))));
8466
emit_wqm(bld, tmp, dst);
8467
} else if (instr->dest.ssa.bit_size == 8) {
8468
Temp tmp = bld.tmp(v1);
8469
if (ctx->program->chip_class >= GFX8)
8470
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8471
else
8472
emit_wqm(bld,
8473
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8474
tmp);
8475
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
8476
} else if (instr->dest.ssa.bit_size == 16) {
8477
Temp tmp = bld.tmp(v1);
8478
if (ctx->program->chip_class >= GFX8)
8479
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8480
else
8481
emit_wqm(bld,
8482
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8483
tmp);
8484
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
8485
} else if (instr->dest.ssa.bit_size == 32) {
8486
if (ctx->program->chip_class >= GFX8)
8487
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
8488
else
8489
emit_wqm(bld,
8490
bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl),
8491
dst);
8492
} else if (instr->dest.ssa.bit_size == 64) {
8493
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8494
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8495
if (ctx->program->chip_class >= GFX8) {
8496
lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
8497
hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
8498
} else {
8499
lo = emit_wqm(
8500
bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
8501
hi = emit_wqm(
8502
bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
8503
}
8504
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8505
emit_split_vector(ctx, dst, 2);
8506
} else {
8507
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8508
}
8509
}
8510
break;
8511
}
8512
case nir_intrinsic_quad_swap_horizontal:
8513
case nir_intrinsic_quad_swap_vertical:
8514
case nir_intrinsic_quad_swap_diagonal:
8515
case nir_intrinsic_quad_swizzle_amd: {
8516
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8517
if (!nir_dest_is_divergent(instr->dest)) {
8518
emit_uniform_subgroup(ctx, instr, src);
8519
break;
8520
}
8521
uint16_t dpp_ctrl = 0;
8522
switch (instr->intrinsic) {
8523
case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8524
case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8525
case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8526
case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8527
default: break;
8528
}
8529
if (ctx->program->chip_class < GFX8)
8530
dpp_ctrl |= (1 << 15);
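/* Pre-GFX8 has no DPP, so the quad permutation is done with ds_swizzle_b32 instead;
 * bit 15 of the swizzle offset selects its quad-permute mode, where the low 8 bits
 * carry the same four 2-bit lane selects as dpp_quad_perm. */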
8531
8532
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8533
8534
if (instr->dest.ssa.bit_size != 1)
8535
src = as_vgpr(ctx, src);
8536
8537
if (instr->dest.ssa.bit_size == 1) {
8538
assert(src.regClass() == bld.lm);
8539
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8540
Operand::c32(-1), src);
8541
if (ctx->program->chip_class >= GFX8)
8542
src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
8543
else
8544
src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
8545
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8546
emit_wqm(bld, tmp, dst);
8547
} else if (instr->dest.ssa.bit_size == 8) {
8548
Temp tmp = bld.tmp(v1);
8549
if (ctx->program->chip_class >= GFX8)
8550
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8551
else
8552
emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
8553
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
8554
} else if (instr->dest.ssa.bit_size == 16) {
8555
Temp tmp = bld.tmp(v1);
8556
if (ctx->program->chip_class >= GFX8)
8557
emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
8558
else
8559
emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
8560
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
8561
} else if (instr->dest.ssa.bit_size == 32) {
8562
Temp tmp;
8563
if (ctx->program->chip_class >= GFX8)
8564
tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
8565
else
8566
tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
8567
emit_wqm(bld, tmp, dst);
8568
} else if (instr->dest.ssa.bit_size == 64) {
8569
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8570
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8571
if (ctx->program->chip_class >= GFX8) {
8572
lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
8573
hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
8574
} else {
8575
lo = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
8576
hi = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
8577
}
8578
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8579
emit_split_vector(ctx, dst, 2);
8580
} else {
8581
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8582
}
8583
break;
8584
}
8585
case nir_intrinsic_masked_swizzle_amd: {
8586
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8587
if (!nir_dest_is_divergent(instr->dest)) {
8588
emit_uniform_subgroup(ctx, instr, src);
8589
break;
8590
}
8591
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8592
uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8593
8594
if (instr->dest.ssa.bit_size != 1)
8595
src = as_vgpr(ctx, src);
8596
8597
if (instr->dest.ssa.bit_size == 1) {
8598
assert(src.regClass() == bld.lm);
8599
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8600
Operand::c32(-1), src);
8601
src = emit_masked_swizzle(ctx, bld, src, mask);
8602
Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8603
emit_wqm(bld, tmp, dst);
8604
} else if (dst.regClass() == v1b) {
8605
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8606
emit_extract_vector(ctx, tmp, 0, dst);
8607
} else if (dst.regClass() == v2b) {
8608
Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8609
emit_extract_vector(ctx, tmp, 0, dst);
8610
} else if (dst.regClass() == v1) {
8611
emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8612
} else if (dst.regClass() == v2) {
8613
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8614
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8615
lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8616
hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8617
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8618
emit_split_vector(ctx, dst, 2);
8619
} else {
8620
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8621
}
8622
break;
8623
}
8624
case nir_intrinsic_write_invocation_amd: {
8625
Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8626
Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8627
Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8628
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8629
if (dst.regClass() == v1) {
8630
/* src2 is ignored for writelane. RA assigns the same reg for dst */
8631
emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8632
} else if (dst.regClass() == v2) {
8633
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8634
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8635
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8636
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8637
Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8638
Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8639
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8640
emit_split_vector(ctx, dst, 2);
8641
} else {
8642
isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8643
}
8644
break;
8645
}
8646
case nir_intrinsic_mbcnt_amd: {
8647
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8648
Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8649
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8650
/* Fit 64-bit mask for wave32 */
8651
src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8652
Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8653
emit_wqm(bld, wqm_tmp, dst);
8654
break;
8655
}
8656
case nir_intrinsic_byte_permute_amd: {
8657
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8658
assert(dst.regClass() == v1);
8659
assert(ctx->program->chip_class >= GFX8);
8660
bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
8661
as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
8662
as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8663
break;
8664
}
8665
case nir_intrinsic_lane_permute_16_amd: {
8666
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8667
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8668
assert(ctx->program->chip_class >= GFX10);
8669
8670
if (src.regClass() == s1) {
8671
bld.copy(Definition(dst), src);
8672
} else if (dst.regClass() == v1 && src.regClass() == v1) {
8673
bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8674
bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8675
bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8676
} else {
8677
isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8678
}
8679
break;
8680
}
8681
case nir_intrinsic_load_helper_invocation:
8682
case nir_intrinsic_is_helper_invocation: {
8683
/* load_helper() after demote() gets lowered to is_helper().
8684
* Otherwise, these two behave the same. */
8685
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8686
bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
8687
ctx->block->kind |= block_kind_needs_lowering;
8688
ctx->program->needs_exact = true;
8689
break;
8690
}
8691
case nir_intrinsic_demote:
8692
bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));
8693
8694
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8695
ctx->cf_info.exec_potentially_empty_discard = true;
8696
ctx->block->kind |= block_kind_uses_demote;
8697
ctx->program->needs_exact = true;
8698
break;
8699
case nir_intrinsic_demote_if: {
8700
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8701
assert(src.regClass() == bld.lm);
8702
Temp cond =
8703
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8704
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8705
8706
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8707
ctx->cf_info.exec_potentially_empty_discard = true;
8708
ctx->block->kind |= block_kind_uses_demote;
8709
ctx->program->needs_exact = true;
8710
break;
8711
}
8712
case nir_intrinsic_first_invocation: {
8713
emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8714
get_ssa_temp(ctx, &instr->dest.ssa));
8715
break;
8716
}
8717
case nir_intrinsic_last_invocation: {
8718
Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8719
Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8720
Operand::c32(ctx->program->wave_size - 1u), flbit);
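/* s_flbit returns the number of leading zero bits in exec, so
 * wave_size - 1 - flbit is the index of the highest set bit, i.e. the last
 * active invocation. */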
8721
emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
8722
break;
8723
}
8724
case nir_intrinsic_elect: {
8725
Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
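/* s_ff1 gives the lowest set bit of exec; shifting 1 left by that amount below
 * builds a mask with exactly that one lane set, so only the first active
 * invocation reads true. */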
8726
emit_wqm(
8727
bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(1u), first),
8728
get_ssa_temp(ctx, &instr->dest.ssa));
8729
break;
8730
}
8731
case nir_intrinsic_shader_clock: {
8732
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8733
if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
8734
ctx->options->chip_class >= GFX10_3) {
8735
/* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8736
Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
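/* The immediate is hwreg(id, offset, size) encoded as
 * id[5:0] | offset[10:6] | (size-1)[15:11]: id = 29 (SHADER_CYCLES), offset = 0,
 * size = 20, so a 20-bit cycle counter is read and the upper dword of the
 * destination is zeroed by the create_vector below. */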
8737
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8738
} else {
8739
aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
8740
? aco_opcode::s_memrealtime
8741
: aco_opcode::s_memtime;
8742
bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8743
}
8744
emit_split_vector(ctx, dst, 2);
8745
break;
8746
}
8747
case nir_intrinsic_load_vertex_id_zero_base: {
8748
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8749
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8750
break;
8751
}
8752
case nir_intrinsic_load_first_vertex: {
8753
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8754
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8755
break;
8756
}
8757
case nir_intrinsic_load_base_instance: {
8758
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8759
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8760
break;
8761
}
8762
case nir_intrinsic_load_instance_id: {
8763
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8764
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8765
break;
8766
}
8767
case nir_intrinsic_load_draw_id: {
8768
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8769
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8770
break;
8771
}
8772
case nir_intrinsic_load_invocation_id: {
8773
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8774
8775
if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8776
if (ctx->options->chip_class >= GFX10)
8777
bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8778
get_arg(ctx, ctx->args->ac.gs_invocation_id));
8779
else
8780
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8781
} else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8782
bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
8783
Operand::c32(8u), Operand::c32(5u));
8784
} else {
8785
unreachable("Unsupported stage for load_invocation_id");
8786
}
8787
8788
break;
8789
}
8790
case nir_intrinsic_load_primitive_id: {
8791
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8792
8793
switch (ctx->shader->info.stage) {
8794
case MESA_SHADER_GEOMETRY:
8795
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8796
break;
8797
case MESA_SHADER_TESS_CTRL:
8798
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8799
break;
8800
case MESA_SHADER_TESS_EVAL:
8801
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8802
break;
8803
default:
8804
if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
8805
/* In case of NGG, the GS threads always have the primitive ID
8806
* even if there is no SW GS. */
8807
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8808
break;
8809
}
8810
unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8811
}
8812
8813
break;
8814
}
8815
case nir_intrinsic_load_patch_vertices_in: {
8816
assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8817
ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8818
8819
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8820
bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.input_vertices));
8821
break;
8822
}
8823
case nir_intrinsic_emit_vertex_with_counter: {
8824
assert(ctx->stage.hw == HWStage::GS);
8825
visit_emit_vertex_with_counter(ctx, instr);
8826
break;
8827
}
8828
case nir_intrinsic_end_primitive_with_counter: {
8829
if (ctx->stage.hw != HWStage::NGG) {
8830
unsigned stream = nir_intrinsic_stream_id(instr);
8831
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
8832
sendmsg_gs(true, false, stream));
8833
}
8834
break;
8835
}
8836
case nir_intrinsic_set_vertex_and_primitive_count: {
8837
assert(ctx->stage.hw == HWStage::GS);
8838
/* unused in the legacy pipeline, the HW keeps track of this for us */
8839
break;
8840
}
8841
case nir_intrinsic_load_tess_rel_patch_id_amd: {
8842
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
8843
break;
8844
}
8845
case nir_intrinsic_load_ring_tess_factors_amd: {
8846
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8847
ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
8848
break;
8849
}
8850
case nir_intrinsic_load_ring_tess_factors_offset_amd: {
8851
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8852
get_arg(ctx, ctx->args->ac.tcs_factor_offset));
8853
break;
8854
}
8855
case nir_intrinsic_load_ring_tess_offchip_amd: {
8856
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8857
ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
8858
break;
8859
}
8860
case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
8861
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8862
get_arg(ctx, ctx->args->ac.tess_offchip_offset));
8863
break;
8864
}
8865
case nir_intrinsic_load_ring_esgs_amd: {
8866
unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
8867
bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8868
ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
8869
break;
8870
}
8871
case nir_intrinsic_load_ring_es2gs_offset_amd: {
8872
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8873
get_arg(ctx, ctx->args->ac.es2gs_offset));
8874
break;
8875
}
8876
case nir_intrinsic_load_gs_vertex_offset_amd: {
8877
unsigned b = nir_intrinsic_base(instr);
8878
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8879
get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
8880
break;
8881
}
8882
case nir_intrinsic_has_input_vertex_amd:
8883
case nir_intrinsic_has_input_primitive_amd: {
8884
assert(ctx->stage.hw == HWStage::NGG);
8885
unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
8886
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
8887
break;
8888
}
8889
case nir_intrinsic_load_workgroup_num_input_vertices_amd:
8890
case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
8891
assert(ctx->stage.hw == HWStage::NGG);
8892
unsigned pos =
8893
instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
8894
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8895
bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
8896
Operand::c32(pos | (9u << 16u)));
8897
break;
8898
}
8899
case nir_intrinsic_load_initial_edgeflag_amd: {
8900
assert(ctx->stage.hw == HWStage::NGG);
8901
assert(nir_src_is_const(instr->src[0]));
8902
unsigned i = nir_src_as_uint(instr->src[0]);
8903
8904
Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
8905
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8906
gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u));
8907
break;
8908
}
8909
case nir_intrinsic_load_packed_passthrough_primitive_amd: {
8910
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8911
get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
8912
break;
8913
}
8914
case nir_intrinsic_export_vertex_amd: {
8915
ctx->block->kind |= block_kind_export_end;
8916
create_vs_exports(ctx);
8917
break;
8918
}
8919
case nir_intrinsic_export_primitive_amd: {
8920
assert(ctx->stage.hw == HWStage::NGG);
8921
Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
8922
bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
8923
1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
8924
true /* done */, false /* valid mask */);
8925
break;
8926
}
8927
case nir_intrinsic_alloc_vertices_and_primitives_amd: {
8928
assert(ctx->stage.hw == HWStage::NGG);
8929
Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
8930
Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
8931
ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
8932
break;
8933
}
8934
case nir_intrinsic_gds_atomic_add_amd: {
8935
Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8936
Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8937
Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8938
Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8939
bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8940
true);
8941
break;
8942
}
8943
case nir_intrinsic_load_shader_query_enabled_amd: {
8944
unsigned cmp_bit = 0;
8945
Temp shader_query_enabled =
8946
bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8947
get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
8948
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8949
bool_to_vector_condition(ctx, shader_query_enabled));
8950
break;
8951
}
8952
case nir_intrinsic_load_cull_front_face_enabled_amd:
8953
case nir_intrinsic_load_cull_back_face_enabled_amd:
8954
case nir_intrinsic_load_cull_ccw_amd:
8955
case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
8956
unsigned cmp_bit;
8957
if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
8958
cmp_bit = 0;
8959
else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
8960
cmp_bit = 1;
8961
else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
8962
cmp_bit = 2;
8963
else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
8964
cmp_bit = 3;
8965
else
8966
unreachable("unimplemented culling intrinsic");
8967
8968
Builder::Result enabled =
8969
bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8970
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
8971
enabled.instr->definitions[0].setNoCSE(true);
8972
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8973
bool_to_vector_condition(ctx, enabled));
8974
break;
8975
}
8976
case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
8977
case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8978
case nir_intrinsic_load_cull_any_enabled_amd: {
8979
Builder::Result cull_any_enabled =
8980
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8981
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0x00ffffffu));
8982
cull_any_enabled.instr->definitions[1].setNoCSE(true);
8983
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8984
bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
8985
break;
8986
}
8987
case nir_intrinsic_load_cull_small_prim_precision_amd: {
8988
/* Exponent is 8-bit signed int, move that into a signed 32-bit int. */
8989
Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
8990
get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
8991
/* small_prim_precision = 1.0 * 2^X */
8992
bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8993
Operand::c32(0x3f800000u), Operand(exponent));
8994
break;
8995
}
8996
case nir_intrinsic_load_viewport_x_scale: {
8997
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8998
get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
8999
break;
9000
}
9001
case nir_intrinsic_load_viewport_y_scale: {
9002
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9003
get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
9004
break;
9005
}
9006
case nir_intrinsic_load_viewport_x_offset: {
9007
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9008
get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
9009
break;
9010
}
9011
case nir_intrinsic_load_viewport_y_offset: {
9012
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9013
get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
9014
break;
9015
}
9016
case nir_intrinsic_overwrite_vs_arguments_amd: {
9017
ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9018
ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9019
break;
9020
}
9021
case nir_intrinsic_overwrite_tes_arguments_amd: {
9022
ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9023
ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9024
ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9025
get_ssa_temp(ctx, instr->src[2].ssa);
9026
ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9027
break;
9028
}
9029
case nir_intrinsic_overwrite_subgroup_num_vertices_and_primitives_amd: {
9030
Temp old_merged_wave_info = get_arg(ctx, ctx->args->ac.merged_wave_info);
9031
Temp num_vertices = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9032
Temp num_primitives = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9033
Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), num_primitives,
9034
Operand::c32(8u));
9035
tmp = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), tmp, num_vertices);
9036
ctx->arg_temps[ctx->args->ac.merged_wave_info.arg_index] =
9037
bld.sop2(aco_opcode::s_pack_lh_b32_b16, bld.def(s1), tmp, old_merged_wave_info);
9038
break;
9039
}
9040
default:
9041
isel_err(&instr->instr, "Unimplemented intrinsic instr");
9042
abort();
9043
9044
break;
9045
}
9046
}
9047
9048
void
9049
tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
9050
Temp* fmask_ptr, enum glsl_base_type* stype)
9051
{
9052
nir_deref_instr* texture_deref_instr = NULL;
9053
nir_deref_instr* sampler_deref_instr = NULL;
9054
int plane = -1;
9055
9056
for (unsigned i = 0; i < instr->num_srcs; i++) {
9057
switch (instr->src[i].src_type) {
9058
case nir_tex_src_texture_deref:
9059
texture_deref_instr = nir_src_as_deref(instr->src[i].src);
9060
break;
9061
case nir_tex_src_sampler_deref:
9062
sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
9063
break;
9064
case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
9065
default: break;
9066
}
9067
}
9068
9069
*stype = glsl_get_sampler_result_type(texture_deref_instr->type);
9070
9071
if (!sampler_deref_instr)
9072
sampler_deref_instr = texture_deref_instr;
9073
9074
if (plane >= 0) {
9075
assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
9076
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
9077
*res_ptr = get_sampler_desc(ctx, texture_deref_instr,
9078
(aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
9079
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9080
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
9081
} else if (instr->op == nir_texop_fragment_mask_fetch) {
9082
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9083
} else {
9084
*res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
9085
}
9086
if (samp_ptr) {
9087
*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);
9088
9089
if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
9090
/* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
9091
Builder bld(ctx->program, ctx->block);
9092
9093
/* to avoid unnecessary moves, we split and recombine sampler and image */
9094
Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
9095
bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9096
Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9097
bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
9098
Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
9099
Definition(img[6]), Definition(img[7]), *res_ptr);
9100
bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
9101
Definition(samp[2]), Definition(samp[3]), *samp_ptr);
9102
9103
samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
9104
*res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
9105
img[3], img[4], img[5], img[6], img[7]);
9106
*samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
9107
samp[3]);
9108
}
9109
}
9110
if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical))
9111
*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9112
}
9113
9114
void
9115
build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9116
Temp* out_tc)
9117
{
9118
Builder bld(ctx->program, ctx->block);
9119
9120
Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9121
Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9122
Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9123
9124
Operand neg_one = Operand::c32(0xbf800000u);
9125
Operand one = Operand::c32(0x3f800000u);
9126
Operand two = Operand::c32(0x40000000u);
9127
Operand four = Operand::c32(0x40800000u);
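/* These are the IEEE-754 bit patterns for -1.0, 1.0, 2.0 and 4.0. The cube face id
 * from v_cubeid_f32 is 0/1 for +/-X, 2/3 for +/-Y and 4/5 for +/-Z, so the
 * comparisons against 4.0 and 2.0 below classify which axis is the major axis. */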
9128
9129
Temp is_ma_positive =
9130
bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
9131
Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
9132
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);
9133
9134
Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
9135
Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
9136
is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
9137
Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
9138
bld.def(s1, scc), is_ma_z, is_ma_y);
9139
9140
/* select sc */
9141
Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
9142
Temp sgn = bld.vop2_e64(
9143
aco_opcode::v_cndmask_b32, bld.def(v1),
9144
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
9145
*out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9146
9147
/* select tc */
9148
tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
9149
sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
9150
*out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9151
9152
/* select ma */
9153
tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9154
bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
9155
deriv_z, is_ma_z);
9156
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
9157
*out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
9158
}
9159
9160
void
9161
prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9162
bool is_deriv, bool is_array)
9163
{
9164
Builder bld(ctx->program, ctx->block);
9165
Temp ma, tc, sc, id;
9166
aco_opcode madak =
9167
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9168
aco_opcode madmk =
9169
ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9170
9171
if (is_array) {
9172
coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
9173
9174
/* see comment in ac_prepare_cube_coords() */
9175
if (ctx->options->chip_class <= GFX8)
9176
coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
9177
}
9178
9179
ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9180
9181
aco_ptr<VOP3_instruction> vop3a{
9182
create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
9183
vop3a->operands[0] = Operand(ma);
9184
vop3a->abs[0] = true;
9185
Temp invma = bld.tmp(v1);
9186
vop3a->definitions[0] = Definition(invma);
9187
ctx->block->instructions.emplace_back(std::move(vop3a));
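/* v_rcp_f32 is built manually as a VOP3 instruction so the abs input modifier can
 * be applied to the operand, i.e. invma = 1.0 / |ma|. */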
9188
9189
sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9190
if (!is_deriv)
9191
sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9192
9193
tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9194
if (!is_deriv)
9195
tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9196
9197
id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9198
9199
if (is_deriv) {
9200
sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
9201
tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
9202
9203
for (unsigned i = 0; i < 2; i++) {
9204
/* see comment in ac_prepare_cube_coords() */
9205
Temp deriv_ma;
9206
Temp deriv_sc, deriv_tc;
9207
build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9208
9209
deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
9210
9211
Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9212
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
9213
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
9214
Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9215
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
9216
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
9217
*(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
9218
}
9219
9220
sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
9221
tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
9222
}
9223
9224
if (is_array)
9225
id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
9226
coords.resize(3);
9227
coords[0] = sc;
9228
coords[1] = tc;
9229
coords[2] = id;
9230
}
9231
9232
void
9233
get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
9234
{
9235
if (vec->parent_instr->type != nir_instr_type_alu)
9236
return;
9237
nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9238
if (vec_instr->op != nir_op_vec(vec->num_components))
9239
return;
9240
9241
for (unsigned i = 0; i < vec->num_components; i++) {
9242
cv[i] =
9243
vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9244
}
9245
}
9246
9247
void
9248
visit_tex(isel_context* ctx, nir_tex_instr* instr)
9249
{
9250
Builder bld(ctx->program, ctx->block);
9251
bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9252
has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9253
has_sample_index = false, has_clamped_lod = false;
9254
Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
9255
lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
9256
clamped_lod = Temp();
9257
std::vector<Temp> coords;
9258
std::vector<Temp> derivs;
9259
nir_const_value* sample_index_cv = NULL;
9260
nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9261
enum glsl_base_type stype;
9262
tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
9263
9264
bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
9265
(stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
9266
bool tg4_integer_cube_workaround =
9267
tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9268
9269
for (unsigned i = 0; i < instr->num_srcs; i++) {
9270
switch (instr->src[i].src_type) {
9271
case nir_tex_src_coord: {
9272
Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9273
for (unsigned j = 0; j < coord.size(); j++)
9274
coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
9275
break;
9276
}
9277
case nir_tex_src_bias:
9278
bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9279
has_bias = true;
9280
break;
9281
case nir_tex_src_lod: {
9282
if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9283
level_zero = true;
9284
} else {
9285
lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9286
has_lod = true;
9287
}
9288
break;
9289
}
9290
case nir_tex_src_min_lod:
9291
clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9292
has_clamped_lod = true;
9293
break;
9294
case nir_tex_src_comparator:
9295
if (instr->is_shadow) {
9296
compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9297
has_compare = true;
9298
}
9299
break;
9300
case nir_tex_src_offset:
9301
offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9302
get_const_vec(instr->src[i].src.ssa, const_offset);
9303
has_offset = true;
9304
break;
9305
case nir_tex_src_ddx:
9306
ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
9307
has_ddx = true;
9308
break;
9309
case nir_tex_src_ddy:
9310
ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
9311
has_ddy = true;
9312
break;
9313
case nir_tex_src_ms_index:
9314
sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
9315
sample_index_cv = nir_src_as_const_value(instr->src[i].src);
9316
has_sample_index = true;
9317
break;
9318
case nir_tex_src_texture_offset:
9319
case nir_tex_src_sampler_offset:
9320
default: break;
9321
}
9322
}
9323
9324
if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9325
return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9326
9327
if (instr->op == nir_texop_texture_samples) {
9328
get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9329
return;
9330
}
9331
9332
if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
9333
aco_ptr<Instruction> tmp_instr;
9334
Temp acc, pack = Temp();
9335
9336
uint32_t pack_const = 0;
9337
for (unsigned i = 0; i < offset.size(); i++) {
9338
if (!const_offset[i])
9339
continue;
9340
pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9341
}
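/* For the *_o sample opcodes the offset operand appears to pack each component as
 * a signed 6-bit field in its own byte (x in bits 5:0, y in 13:8, z in 21:16),
 * hence the 0x3F masks and the 8 * i shifts here and below. */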
9342
9343
if (offset.type() == RegType::sgpr) {
9344
for (unsigned i = 0; i < offset.size(); i++) {
9345
if (const_offset[i])
9346
continue;
9347
9348
acc = emit_extract_vector(ctx, offset, i, s1);
9349
acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9350
Operand::c32(0x3Fu));
9351
9352
if (i) {
9353
acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9354
Operand::c32(8u * i));
9355
}
9356
9357
if (pack == Temp()) {
9358
pack = acc;
9359
} else {
9360
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9361
}
9362
}
9363
9364
if (pack_const && pack != Temp())
9365
pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9366
Operand::c32(pack_const), pack);
9367
} else {
9368
for (unsigned i = 0; i < offset.size(); i++) {
9369
if (const_offset[i])
9370
continue;
9371
9372
acc = emit_extract_vector(ctx, offset, i, v1);
9373
acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9374
9375
if (i) {
9376
acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9377
}
9378
9379
if (pack == Temp()) {
9380
pack = acc;
9381
} else {
9382
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9383
}
9384
}
9385
9386
if (pack_const && pack != Temp())
9387
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9388
}
9389
if (pack_const && pack == Temp())
9390
offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9391
else if (pack == Temp())
9392
has_offset = false;
9393
else
9394
offset = pack;
9395
}
9396
9397
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
9398
prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9399
instr->is_array && instr->op != nir_texop_lod);
9400
9401
/* pack derivatives */
9402
if (has_ddx || has_ddy) {
9403
if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
9404
assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
9405
Temp zero = bld.copy(bld.def(v1), Operand::zero());
9406
derivs = {ddx, zero, ddy, zero};
9407
} else {
9408
for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
9409
derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
9410
for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
9411
derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
9412
}
9413
has_derivs = true;
9414
}
9415
9416
if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9417
instr->is_array && instr->op != nir_texop_txf)
9418
coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
9419
9420
if (instr->coord_components > 2 &&
9421
(instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9422
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
9423
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9424
instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms &&
9425
instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch)
9426
coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
9427
9428
if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9429
instr->op != nir_texop_lod && instr->coord_components) {
9430
assert(coords.size() > 0 && coords.size() < 3);
9431
9432
coords.insert(std::next(coords.begin()),
9433
bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
9434
: Operand::c32(0x3f000000)));
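/* GFX9 addresses 1D textures as 2D, so a dummy Y coordinate is inserted here:
 * 0 for txf (unnormalized integer coords), 0.5 (0x3f000000) otherwise, which
 * lands in the middle of the single row. */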
9435
}
9436
9437
bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9438
9439
if (instr->op == nir_texop_samples_identical)
9440
resource = fmask_ptr;
9441
9442
else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9443
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9444
instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch &&
9445
instr->op != nir_texop_fragment_mask_fetch) {
9446
assert(has_sample_index);
9447
Operand op(sample_index);
9448
if (sample_index_cv)
9449
op = Operand::c32(sample_index_cv->u32);
9450
sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
9451
}
9452
9453
if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
9454
for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
9455
Temp off = emit_extract_vector(ctx, offset, i, v1);
9456
coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
9457
}
9458
has_offset = false;
9459
}
9460
9461
/* Build tex instruction */
9462
unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
9463
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9464
dmask = u_bit_consecutive(0, util_last_bit(dmask));
9465
if (instr->is_sparse)
9466
dmask = MAX2(dmask, 1) | 0x10;
9467
unsigned dim =
9468
ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9469
? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
9470
: 0;
9471
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9472
Temp tmp_dst = dst;
9473
9474
/* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9475
if (instr->op == nir_texop_tg4) {
9476
assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
9477
if (instr->is_shadow)
9478
dmask = 1;
9479
else
9480
dmask = 1 << instr->component;
9481
if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9482
tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
9483
} else if (instr->op == nir_texop_samples_identical) {
9484
tmp_dst = bld.tmp(v1);
9485
} else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
9486
dst.type() == RegType::sgpr) {
9487
tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
9488
}
9489
9490
if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
9491
if (!has_lod)
9492
lod = bld.copy(bld.def(v1), Operand::zero());
9493
9494
bool div_by_6 = instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
9495
instr->is_array && (dmask & (1 << 2));
9496
if (tmp_dst.id() == dst.id() && div_by_6)
9497
tmp_dst = bld.tmp(tmp_dst.regClass());
9498
9499
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
9500
resource, Operand(s4), std::vector<Temp>{lod});
9501
if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
9502
instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
9503
tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
9504
} else if (instr->op == nir_texop_query_levels) {
9505
tex->dmask = 1 << 3;
9506
} else {
9507
tex->dmask = dmask;
9508
}
9509
tex->da = da;
9510
tex->dim = dim;
9511
9512
if (div_by_6) {
9513
/* divide 3rd value by 6 by multiplying with magic number */
9514
emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9515
Temp c = bld.copy(bld.def(s1), Operand::c32(0x2AAAAAAB));
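/* 0x2AAAAAAB = 715827883 = (2^32 + 2) / 6, so for a layer count n that is a
 * multiple of 6, n * 0x2AAAAAAB = (n / 6) * 2^32 + n / 3 and v_mul_hi_i32
 * returns exactly n / 6 (cube arrays report their size in layers, 6 per cube). */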
9516
Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1),
9517
emit_extract_vector(ctx, tmp_dst, 2, v1), c);
9518
assert(instr->dest.ssa.num_components == 3);
9519
Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
9520
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
9521
emit_extract_vector(ctx, tmp_dst, 0, v1),
9522
emit_extract_vector(ctx, tmp_dst, 1, v1), by_6);
9523
}
9524
9525
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9526
return;
9527
}
9528
9529
Temp tg4_compare_cube_wa64 = Temp();
9530
9531
if (tg4_integer_workarounds) {
9532
Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9533
Temp size = bld.tmp(v2);
9534
MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
9535
resource, Operand(s4), std::vector<Temp>{tg4_lod});
9536
tex->dim = dim;
9537
tex->dmask = 0x3;
9538
tex->da = da;
9539
emit_split_vector(ctx, size, size.size());
9540
9541
Temp half_texel[2];
9542
for (unsigned i = 0; i < 2; i++) {
9543
half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9544
half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9545
half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9546
half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9547
Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9548
}
9549
9550
if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9551
/* In vulkan, whether the sampler uses unnormalized
9552
* coordinates or not is a dynamic property of the
9553
* sampler. Hence, to figure out whether or not we
9554
* need to divide by the texture size, we need to test
9555
* the sampler at runtime. This tests the bit set by
9556
* radv_init_sampler().
9557
*/
9558
unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9559
Temp not_needed =
9560
bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9561
9562
not_needed = bool_to_vector_condition(ctx, not_needed);
9563
half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9564
Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9565
half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9566
Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9567
}
9568
9569
Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9570
bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9571
9572
if (tg4_integer_cube_workaround) {
9573
/* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9574
Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9575
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9576
aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9577
split->operands[0] = Operand(resource);
9578
for (unsigned i = 0; i < resource.size(); i++) {
9579
desc[i] = bld.tmp(s1);
9580
split->definitions[i] = Definition(desc[i]);
9581
}
9582
ctx->block->instructions.emplace_back(std::move(split));
9583
9584
Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9585
Operand::c32(20u | (6u << 16)));
9586
Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9587
Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9588
9589
Temp nfmt;
9590
if (stype == GLSL_TYPE_UINT) {
9591
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9592
Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9593
Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9594
} else {
9595
nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9596
Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9597
Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9598
}
9599
tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9600
bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9601
9602
nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9603
Operand::c32(26u));
9604
9605
desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9606
Operand::c32(C_008F14_NUM_FORMAT));
9607
desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9608
9609
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9610
aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9611
for (unsigned i = 0; i < resource.size(); i++)
9612
vec->operands[i] = Operand(desc[i]);
9613
resource = bld.tmp(resource.regClass());
9614
vec->definitions[0] = Definition(resource);
9615
ctx->block->instructions.emplace_back(std::move(vec));
9616
9617
new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9618
tg4_compare_cube_wa64);
9619
new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9620
tg4_compare_cube_wa64);
9621
}
9622
coords[0] = new_coords[0];
9623
coords[1] = new_coords[1];
9624
}
9625
9626
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9627
// FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9628
// ac_build_buffer_load_format_gfx9_safe()
9629
9630
assert(coords.size() == 1);
9631
aco_opcode op;
9632
switch (util_last_bit(dmask & 0xf)) {
9633
case 1: op = aco_opcode::buffer_load_format_x; break;
9634
case 2: op = aco_opcode::buffer_load_format_xy; break;
9635
case 3: op = aco_opcode::buffer_load_format_xyz; break;
9636
case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9637
default: unreachable("Tex instruction loads more than 4 components.");
9638
}
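/* Note: the opcode is chosen from the highest requested component, so a sparse dmask such as
 * 0x5 (x and z) still selects buffer_load_format_xyz (components below the highest one are
 * loaded even if unused). */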
9639
9640
aco_ptr<MUBUF_instruction> mubuf{
9641
create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9642
mubuf->operands[0] = Operand(resource);
9643
mubuf->operands[1] = Operand(coords[0]);
9644
mubuf->operands[2] = Operand::c32(0);
9645
mubuf->definitions[0] = Definition(tmp_dst);
9646
mubuf->idxen = true;
9647
mubuf->tfe = instr->is_sparse;
9648
if (mubuf->tfe)
9649
mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9650
ctx->block->instructions.emplace_back(std::move(mubuf));
9651
9652
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9653
return;
9654
}
9655
9656
/* gather MIMG address components */
9657
std::vector<Temp> args;
9658
unsigned wqm_mask = 0;
9659
if (has_offset) {
9660
wqm_mask |= u_bit_consecutive(args.size(), 1);
9661
args.emplace_back(offset);
9662
}
9663
if (has_bias)
9664
args.emplace_back(bias);
9665
if (has_compare)
9666
args.emplace_back(compare);
9667
if (has_derivs)
9668
args.insert(args.end(), derivs.begin(), derivs.end());
9669
9670
wqm_mask |= u_bit_consecutive(args.size(), coords.size());
9671
args.insert(args.end(), coords.begin(), coords.end());
9672
9673
if (has_sample_index)
9674
args.emplace_back(sample_index);
9675
if (has_lod)
9676
args.emplace_back(lod);
9677
if (has_clamped_lod)
9678
args.emplace_back(clamped_lod);
9679
9680
if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms ||
9681
instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch ||
9682
instr->op == nir_texop_fragment_mask_fetch) {
9683
aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9684
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9685
? aco_opcode::image_load
9686
: aco_opcode::image_load_mip;
9687
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9688
MIMG_instruction* tex =
9689
emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
9690
tex->dim = dim;
9691
tex->dmask = dmask & 0xf;
9692
tex->unrm = true;
9693
tex->da = da;
9694
tex->tfe = instr->is_sparse;
9695
9696
if (instr->op == nir_texop_samples_identical) {
9697
assert(dmask == 1 && dst.regClass() == bld.lm);
9698
assert(dst.id() != tmp_dst.id());
9699
9700
bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), tmp_dst)
9701
.def(0)
9702
.setHint(vcc);
9703
} else {
9704
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9705
}
9706
return;
9707
}
9708
9709
// TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
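/* The chains below rely on later assignments overriding earlier ones, so when several modifiers
 * are present the later checks take precedence (e.g. an explicit lod wins over bias or
 * derivatives). */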
9710
aco_opcode opcode = aco_opcode::image_sample;
9711
if (has_offset) { /* image_sample_*_o */
9712
if (has_clamped_lod) {
9713
if (has_compare) {
9714
opcode = aco_opcode::image_sample_c_cl_o;
9715
if (has_derivs)
9716
opcode = aco_opcode::image_sample_c_d_cl_o;
9717
if (has_bias)
9718
opcode = aco_opcode::image_sample_c_b_cl_o;
9719
} else {
9720
opcode = aco_opcode::image_sample_cl_o;
9721
if (has_derivs)
9722
opcode = aco_opcode::image_sample_d_cl_o;
9723
if (has_bias)
9724
opcode = aco_opcode::image_sample_b_cl_o;
9725
}
9726
} else if (has_compare) {
9727
opcode = aco_opcode::image_sample_c_o;
9728
if (has_derivs)
9729
opcode = aco_opcode::image_sample_c_d_o;
9730
if (has_bias)
9731
opcode = aco_opcode::image_sample_c_b_o;
9732
if (level_zero)
9733
opcode = aco_opcode::image_sample_c_lz_o;
9734
if (has_lod)
9735
opcode = aco_opcode::image_sample_c_l_o;
9736
} else {
9737
opcode = aco_opcode::image_sample_o;
9738
if (has_derivs)
9739
opcode = aco_opcode::image_sample_d_o;
9740
if (has_bias)
9741
opcode = aco_opcode::image_sample_b_o;
9742
if (level_zero)
9743
opcode = aco_opcode::image_sample_lz_o;
9744
if (has_lod)
9745
opcode = aco_opcode::image_sample_l_o;
9746
}
9747
} else if (has_clamped_lod) { /* image_sample_*_cl */
9748
if (has_compare) {
9749
opcode = aco_opcode::image_sample_c_cl;
9750
if (has_derivs)
9751
opcode = aco_opcode::image_sample_c_d_cl;
9752
if (has_bias)
9753
opcode = aco_opcode::image_sample_c_b_cl;
9754
} else {
9755
opcode = aco_opcode::image_sample_cl;
9756
if (has_derivs)
9757
opcode = aco_opcode::image_sample_d_cl;
9758
if (has_bias)
9759
opcode = aco_opcode::image_sample_b_cl;
9760
}
9761
} else { /* no offset */
9762
if (has_compare) {
9763
opcode = aco_opcode::image_sample_c;
9764
if (has_derivs)
9765
opcode = aco_opcode::image_sample_c_d;
9766
if (has_bias)
9767
opcode = aco_opcode::image_sample_c_b;
9768
if (level_zero)
9769
opcode = aco_opcode::image_sample_c_lz;
9770
if (has_lod)
9771
opcode = aco_opcode::image_sample_c_l;
9772
} else {
9773
opcode = aco_opcode::image_sample;
9774
if (has_derivs)
9775
opcode = aco_opcode::image_sample_d;
9776
if (has_bias)
9777
opcode = aco_opcode::image_sample_b;
9778
if (level_zero)
9779
opcode = aco_opcode::image_sample_lz;
9780
if (has_lod)
9781
opcode = aco_opcode::image_sample_l;
9782
}
9783
}
9784
9785
if (instr->op == nir_texop_tg4) {
9786
if (has_offset) { /* image_gather4_*_o */
9787
if (has_compare) {
9788
opcode = aco_opcode::image_gather4_c_lz_o;
9789
if (has_lod)
9790
opcode = aco_opcode::image_gather4_c_l_o;
9791
if (has_bias)
9792
opcode = aco_opcode::image_gather4_c_b_o;
9793
} else {
9794
opcode = aco_opcode::image_gather4_lz_o;
9795
if (has_lod)
9796
opcode = aco_opcode::image_gather4_l_o;
9797
if (has_bias)
9798
opcode = aco_opcode::image_gather4_b_o;
9799
}
9800
} else {
9801
if (has_compare) {
9802
opcode = aco_opcode::image_gather4_c_lz;
9803
if (has_lod)
9804
opcode = aco_opcode::image_gather4_c_l;
9805
if (has_bias)
9806
opcode = aco_opcode::image_gather4_c_b;
9807
} else {
9808
opcode = aco_opcode::image_gather4_lz;
9809
if (has_lod)
9810
opcode = aco_opcode::image_gather4_l;
9811
if (has_bias)
9812
opcode = aco_opcode::image_gather4_b;
9813
}
9814
}
9815
} else if (instr->op == nir_texop_lod) {
9816
opcode = aco_opcode::image_get_lod;
9817
}
9818
9819
bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9820
!level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9821
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9822
9823
Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9824
MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
9825
args, implicit_derivs ? wqm_mask : 0, vdata);
9826
tex->dim = dim;
9827
tex->dmask = dmask & 0xf;
9828
tex->da = da;
9829
tex->tfe = instr->is_sparse;
9830
9831
if (tg4_integer_cube_workaround) {
9832
assert(tmp_dst.id() != dst.id());
9833
assert(tmp_dst.size() == dst.size());
9834
9835
emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9836
Temp val[4];
9837
for (unsigned i = 0; i < 4; i++) {
9838
val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9839
Temp cvt_val;
9840
if (stype == GLSL_TYPE_UINT)
9841
cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9842
else
9843
cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9844
val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9845
tg4_compare_cube_wa64);
9846
}
9847
9848
Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9849
if (instr->is_sparse)
9850
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9851
val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9852
else
9853
tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9854
val[3]);
9855
}
9856
unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9857
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9858
}
9859
9860
Operand
9861
get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
9862
{
9863
Temp tmp = get_ssa_temp(ctx, ssa);
9864
if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9865
return Operand(rc);
9866
} else if (logical && ssa->bit_size == 1 &&
9867
ssa->parent_instr->type == nir_instr_type_load_const) {
9868
if (ctx->program->wave_size == 64)
9869
return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
9870
: 0u);
9871
else
9872
return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
9873
: 0u);
9874
} else {
9875
return Operand(tmp);
9876
}
9877
}
9878
9879
void
9880
visit_phi(isel_context* ctx, nir_phi_instr* instr)
9881
{
9882
aco_ptr<Pseudo_instruction> phi;
9883
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9884
assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9885
9886
bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9887
logical |= (ctx->block->kind & block_kind_merge) != 0;
9888
aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9889
9890
/* we want a sorted list of sources, since the predecessor list is also sorted */
9891
std::map<unsigned, nir_ssa_def*> phi_src;
9892
nir_foreach_phi_src (src, instr)
9893
phi_src[src->pred->index] = src->src.ssa;
9894
9895
std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9896
unsigned num_operands = 0;
9897
Operand* const operands = (Operand*)alloca(
9898
(std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
9899
unsigned num_defined = 0;
9900
unsigned cur_pred_idx = 0;
9901
for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
9902
if (cur_pred_idx < preds.size()) {
9903
/* handle missing preds (IF merges with discard/break) and extra preds
9904
* (loop exit with discard); a standalone sketch of this alignment follows visit_phi() below */
9905
unsigned block = ctx->cf_info.nir_to_aco[src.first];
9906
unsigned skipped = 0;
9907
while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9908
skipped++;
9909
if (cur_pred_idx + skipped < preds.size()) {
9910
for (unsigned i = 0; i < skipped; i++)
9911
operands[num_operands++] = Operand(dst.regClass());
9912
cur_pred_idx += skipped;
9913
} else {
9914
continue;
9915
}
9916
}
9917
/* Handle missing predecessors at the end. This shouldn't happen with loop
9918
* headers, and for loop header phis we can't ignore these sources anyway. */
9919
if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9920
continue;
9921
cur_pred_idx++;
9922
Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9923
operands[num_operands++] = op;
9924
num_defined += !op.isUndefined();
9925
}
9926
/* handle block_kind_continue_or_break at loop exit blocks */
9927
while (cur_pred_idx++ < preds.size())
9928
operands[num_operands++] = Operand(dst.regClass());
9929
9930
/* If the loop ends with a break, still add a linear continue edge in case
9931
* that break is divergent or continue_or_break is used. We'll either remove
9932
* this operand later in visit_loop() if it's not necessary or replace the
9933
* undef with something correct. */
9934
if (!logical && ctx->block->kind & block_kind_loop_header) {
9935
nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9936
nir_block* last = nir_loop_last_block(loop);
9937
if (last->successors[0] != instr->instr.block)
9938
operands[num_operands++] = Operand(RegClass());
9939
}
9940
9941
/* we can use a linear phi in some cases if one src is undef */
9942
if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9943
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9944
num_operands, 1));
9945
9946
Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9947
Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9948
assert(invert->kind & block_kind_invert);
9949
9950
unsigned then_block = invert->linear_preds[0];
9951
9952
Block* insert_block = NULL;
9953
for (unsigned i = 0; i < num_operands; i++) {
9954
Operand op = operands[i];
9955
if (op.isUndefined())
9956
continue;
9957
insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9958
phi->operands[0] = op;
9959
break;
9960
}
9961
assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9962
phi->operands[1] = Operand(dst.regClass());
9963
phi->definitions[0] = Definition(dst);
9964
insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9965
return;
9966
}
9967
9968
phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9969
for (unsigned i = 0; i < num_operands; i++)
9970
phi->operands[i] = operands[i];
9971
phi->definitions[0] = Definition(dst);
9972
ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9973
}
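/* Standalone sketch (disabled) of the predecessor/source alignment performed in visit_phi()
 * above. It is a hypothetical helper, simplified: the nir_to_aco block mapping and the
 * loop-header special cases are ignored, plain ints stand in for Operands, and it relies on
 * <map> and <vector>, which are already included at the top of this file. An "undef" operand
 * is emitted for every ACO predecessor without a matching NIR source, and sources whose block
 * is no longer a predecessor are dropped. */
#if 0
static std::vector<int>
sketch_align_phi_sources(const std::vector<unsigned>& preds, const std::map<unsigned, int>& srcs)
{
   const int undef = -1; /* stands in for Operand(dst.regClass()) */
   std::vector<int> operands;
   unsigned cur = 0;
   for (const std::pair<const unsigned, int>& src : srcs) {
      /* skip predecessors that have no source, emitting undefs for them */
      unsigned skipped = 0;
      while (cur + skipped < preds.size() && preds[cur + skipped] != src.first)
         skipped++;
      if (cur + skipped == preds.size())
         continue; /* this source's block is not a predecessor anymore */
      for (unsigned i = 0; i < skipped; i++)
         operands.push_back(undef);
      cur += skipped + 1;
      operands.push_back(src.second);
   }
   /* trailing predecessors without a source (e.g. continue_or_break exits) */
   while (cur++ < preds.size())
      operands.push_back(undef);
   return operands;
}
#endif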
9974
9975
void
9976
visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
9977
{
9978
Temp dst = get_ssa_temp(ctx, &instr->def);
9979
9980
assert(dst.type() == RegType::sgpr);
9981
9982
if (dst.size() == 1) {
9983
Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9984
} else {
9985
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9986
aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9987
for (unsigned i = 0; i < dst.size(); i++)
9988
vec->operands[i] = Operand::zero();
9989
vec->definitions[0] = Definition(dst);
9990
ctx->block->instructions.emplace_back(std::move(vec));
9991
}
9992
}
9993
9994
void
9995
begin_loop(isel_context* ctx, loop_context* lc)
9996
{
9997
// TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9998
append_logical_end(ctx->block);
9999
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10000
Builder bld(ctx->program, ctx->block);
10001
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10002
unsigned loop_preheader_idx = ctx->block->index;
10003
10004
lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10005
10006
ctx->program->next_loop_depth++;
10007
10008
Block* loop_header = ctx->program->create_and_insert_block();
10009
loop_header->kind |= block_kind_loop_header;
10010
add_edge(loop_preheader_idx, loop_header);
10011
ctx->block = loop_header;
10012
10013
append_logical_start(ctx->block);
10014
10015
lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10016
lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10017
lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10018
lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10019
lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10020
}
10021
10022
void
10023
end_loop(isel_context* ctx, loop_context* lc)
10024
{
10025
// TODO: what if a loop ends with an unconditional or uniformly branched continue
10026
// and this branch is never taken?
10027
if (!ctx->cf_info.has_branch) {
10028
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10029
Builder bld(ctx->program, ctx->block);
10030
append_logical_end(ctx->block);
10031
10032
if (ctx->cf_info.exec_potentially_empty_discard ||
10033
ctx->cf_info.exec_potentially_empty_break) {
10034
/* Discards can result in code running with an empty exec mask.
10035
* This would result in divergent breaks not ever being taken. As a
10036
* workaround, break the loop when the loop mask is empty instead of
10037
* always continuing. */
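/* The resulting linear CFG looks like this (the two helper blocks below exist
 * only to avoid critical edges):
 *
 *        loop body (continue_or_break)
 *            /                \
 *      break_block       continue_block
 *           |                  |
 *       loop_exit         loop_header
 */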
10038
ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10039
unsigned block_idx = ctx->block->index;
10040
10041
/* create helper blocks to avoid critical edges */
10042
Block* break_block = ctx->program->create_and_insert_block();
10043
break_block->kind = block_kind_uniform;
10044
bld.reset(break_block);
10045
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10046
add_linear_edge(block_idx, break_block);
10047
add_linear_edge(break_block->index, &lc->loop_exit);
10048
10049
Block* continue_block = ctx->program->create_and_insert_block();
10050
continue_block->kind = block_kind_uniform;
10051
bld.reset(continue_block);
10052
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10053
add_linear_edge(block_idx, continue_block);
10054
add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10055
10056
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10057
add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10058
ctx->block = &ctx->program->blocks[block_idx];
10059
} else {
10060
ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10061
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10062
add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10063
else
10064
add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10065
}
10066
10067
bld.reset(ctx->block);
10068
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10069
}
10070
10071
ctx->cf_info.has_branch = false;
10072
ctx->program->next_loop_depth--;
10073
10074
// TODO: if the loop does not have a single exit, we must add one
10075
/* emit loop successor block */
10076
ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10077
append_logical_start(ctx->block);
10078
10079
#if 0
10080
// TODO: check if it is beneficial to not branch on continues
10081
/* trim linear phis in loop header */
10082
for (auto&& instr : loop_entry->instructions) {
10083
if (instr->opcode == aco_opcode::p_linear_phi) {
10084
aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10085
new_phi->definitions[0] = instr->definitions[0];
10086
for (unsigned i = 0; i < new_phi->operands.size(); i++)
10087
new_phi->operands[i] = instr->operands[i];
10088
/* check that the remaining operands are all the same */
10089
for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10090
assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10091
instr.swap(new_phi);
10092
} else if (instr->opcode == aco_opcode::p_phi) {
10093
continue;
10094
} else {
10095
break;
10096
}
10097
}
10098
#endif
10099
10100
ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10101
ctx->cf_info.parent_loop.exit = lc->exit_old;
10102
ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10103
ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10104
ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10105
if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10106
ctx->cf_info.exec_potentially_empty_discard = false;
10107
}
10108
10109
void
10110
emit_loop_jump(isel_context* ctx, bool is_break)
10111
{
10112
Builder bld(ctx->program, ctx->block);
10113
Block* logical_target;
10114
append_logical_end(ctx->block);
10115
unsigned idx = ctx->block->index;
10116
10117
if (is_break) {
10118
logical_target = ctx->cf_info.parent_loop.exit;
10119
add_logical_edge(idx, logical_target);
10120
ctx->block->kind |= block_kind_break;
10121
10122
if (!ctx->cf_info.parent_if.is_divergent &&
10123
!ctx->cf_info.parent_loop.has_divergent_continue) {
10124
/* uniform break - directly jump out of the loop */
10125
ctx->block->kind |= block_kind_uniform;
10126
ctx->cf_info.has_branch = true;
10127
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10128
add_linear_edge(idx, logical_target);
10129
return;
10130
}
10131
ctx->cf_info.parent_loop.has_divergent_branch = true;
10132
} else {
10133
logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10134
add_logical_edge(idx, logical_target);
10135
ctx->block->kind |= block_kind_continue;
10136
10137
if (!ctx->cf_info.parent_if.is_divergent) {
10138
/* uniform continue - directly jump to the loop header */
10139
ctx->block->kind |= block_kind_uniform;
10140
ctx->cf_info.has_branch = true;
10141
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10142
add_linear_edge(idx, logical_target);
10143
return;
10144
}
10145
10146
/* for potential uniform breaks after this continue,
10147
we must ensure that they are handled correctly */
10148
ctx->cf_info.parent_loop.has_divergent_continue = true;
10149
ctx->cf_info.parent_loop.has_divergent_branch = true;
10150
}
10151
10152
if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10153
ctx->cf_info.exec_potentially_empty_break = true;
10154
ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10155
}
10156
10157
/* remove critical edges from linear CFG */
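/* The jump block gets two linear successors so that neither edge is critical: break_block
 * branches on to the jump target, while continue_block falls through to the code after the
 * jump (only reached by lanes that did not take it). */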
10158
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10159
Block* break_block = ctx->program->create_and_insert_block();
10160
break_block->kind |= block_kind_uniform;
10161
add_linear_edge(idx, break_block);
10162
/* the loop_header pointer might be invalidated by this point */
10163
if (!is_break)
10164
logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10165
add_linear_edge(break_block->index, logical_target);
10166
bld.reset(break_block);
10167
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10168
10169
Block* continue_block = ctx->program->create_and_insert_block();
10170
add_linear_edge(idx, continue_block);
10171
append_logical_start(continue_block);
10172
ctx->block = continue_block;
10173
}
10174
10175
void
10176
emit_loop_break(isel_context* ctx)
10177
{
10178
emit_loop_jump(ctx, true);
10179
}
10180
10181
void
10182
emit_loop_continue(isel_context* ctx)
10183
{
10184
emit_loop_jump(ctx, false);
10185
}
10186
10187
void
10188
visit_jump(isel_context* ctx, nir_jump_instr* instr)
10189
{
10190
/* visit_block() would usually do this but divergent jumps update ctx->block */
10191
ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10192
10193
switch (instr->type) {
10194
case nir_jump_break: emit_loop_break(ctx); break;
10195
case nir_jump_continue: emit_loop_continue(ctx); break;
10196
default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10197
}
10198
}
10199
10200
void
10201
visit_block(isel_context* ctx, nir_block* block)
10202
{
10203
nir_foreach_instr (instr, block) {
10204
switch (instr->type) {
10205
case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10206
case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10207
case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10208
case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10209
case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10210
case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10211
case nir_instr_type_deref: break;
10212
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10213
default: isel_err(instr, "Unknown NIR instr type");
10214
}
10215
}
10216
10217
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10218
ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10219
}
10220
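/* Computes the value a loop-header linear phi should receive along the final (fallthrough)
 * continue edge: each earlier continue block's contribution is propagated forward through the
 * linear CFG, and new linear phis are inserted wherever predecessors disagree. Returns the
 * value reaching block `last`. */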
10221
static Operand
10222
create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10223
aco_ptr<Instruction>& header_phi, Operand* vals)
10224
{
10225
vals[0] = Operand(header_phi->definitions[0].getTemp());
10226
RegClass rc = vals[0].regClass();
10227
10228
unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10229
10230
unsigned next_pred = 1;
10231
10232
for (unsigned idx = first + 1; idx <= last; idx++) {
10233
Block& block = ctx->program->blocks[idx];
10234
if (block.loop_nest_depth != loop_nest_depth) {
10235
vals[idx - first] = vals[idx - 1 - first];
10236
continue;
10237
}
10238
10239
if ((block.kind & block_kind_continue) && block.index != last) {
10240
vals[idx - first] = header_phi->operands[next_pred];
10241
next_pred++;
10242
continue;
10243
}
10244
10245
bool all_same = true;
10246
for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10247
all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10248
10249
Operand val;
10250
if (all_same) {
10251
val = vals[block.linear_preds[0] - first];
10252
} else {
10253
aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10254
aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10255
for (unsigned i = 0; i < block.linear_preds.size(); i++)
10256
phi->operands[i] = vals[block.linear_preds[i] - first];
10257
val = Operand(ctx->program->allocateTmp(rc));
10258
phi->definitions[0] = Definition(val.getTemp());
10259
block.instructions.emplace(block.instructions.begin(), std::move(phi));
10260
}
10261
vals[idx - first] = val;
10262
}
10263
10264
return vals[last - first];
10265
}
10266
10267
static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10268
static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10269
static void end_uniform_if(isel_context* ctx, if_context* ic);
10270
10271
static void
10272
visit_loop(isel_context* ctx, nir_loop* loop)
10273
{
10274
loop_context lc;
10275
begin_loop(ctx, &lc);
10276
10277
/* NIR seems to allow loops whose exit block has no predecessors (the loop never breaks), yet SSA defs from the
10278
* loop header are still live after the loop. Handle this without complicating the ACO IR by creating a dummy break
10279
* whose condition is constant false, so it is never taken at runtime but still gives the loop exit a predecessor. */
10280
if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10281
Builder bld(ctx->program, ctx->block);
10282
Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10283
if_context ic;
10284
begin_uniform_if_then(ctx, &ic, cond);
10285
emit_loop_break(ctx);
10286
begin_uniform_if_else(ctx, &ic);
10287
end_uniform_if(ctx, &ic);
10288
}
10289
10290
bool unreachable = visit_cf_list(ctx, &loop->body);
10291
10292
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10293
10294
/* Fixup phis in loop header from unreachable blocks.
10295
* has_branch/has_divergent_branch also indicates if the loop ends with a
10296
* break/continue instruction, but we don't emit those if unreachable=true */
10297
if (unreachable) {
10298
assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10299
bool linear = ctx->cf_info.has_branch;
10300
bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10301
for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10302
if ((logical && instr->opcode == aco_opcode::p_phi) ||
10303
(linear && instr->opcode == aco_opcode::p_linear_phi)) {
10304
/* the last operand should be the one that needs to be removed */
10305
instr->operands.pop_back();
10306
} else if (!is_phi(instr)) {
10307
break;
10308
}
10309
}
10310
}
10311
10312
/* Fix up linear phis in the loop header that expect a continue. This fixup
10313
* and the previous one shouldn't both happen at once, because a break in the
10314
* merge block would get CSE'd */
10315
if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10316
unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10317
Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10318
for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10319
if (instr->opcode == aco_opcode::p_linear_phi) {
10320
if (ctx->cf_info.has_branch)
10321
instr->operands.pop_back();
10322
else
10323
instr->operands.back() =
10324
create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10325
} else if (!is_phi(instr)) {
10326
break;
10327
}
10328
}
10329
}
10330
10331
end_loop(ctx, &lc);
10332
}
10333
10334
static void
10335
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10336
{
10337
ic->cond = cond;
10338
10339
append_logical_end(ctx->block);
10340
ctx->block->kind |= block_kind_branch;
10341
10342
/* branch to linear then block */
10343
assert(cond.regClass() == ctx->program->lane_mask);
10344
aco_ptr<Pseudo_branch_instruction> branch;
10345
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10346
Format::PSEUDO_BRANCH, 1, 1));
10347
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10348
branch->definitions[0].setHint(vcc);
10349
branch->operands[0] = Operand(cond);
10350
ctx->block->instructions.push_back(std::move(branch));
10351
10352
ic->BB_if_idx = ctx->block->index;
10353
ic->BB_invert = Block();
10354
/* Invert blocks are intentionally not marked as top level because they
10355
* are not part of the logical cfg. */
10356
ic->BB_invert.kind |= block_kind_invert;
10357
ic->BB_endif = Block();
10358
ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10359
10360
ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10361
ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10362
ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10363
ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10364
ctx->cf_info.parent_if.is_divergent = true;
10365
10366
/* divergent branches use cbranch_execz */
10367
ctx->cf_info.exec_potentially_empty_discard = false;
10368
ctx->cf_info.exec_potentially_empty_break = false;
10369
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10370
10371
/** emit logical then block */
10372
ctx->program->next_divergent_if_logical_depth++;
10373
Block* BB_then_logical = ctx->program->create_and_insert_block();
10374
add_edge(ic->BB_if_idx, BB_then_logical);
10375
ctx->block = BB_then_logical;
10376
append_logical_start(BB_then_logical);
10377
}
10378
10379
static void
10380
begin_divergent_if_else(isel_context* ctx, if_context* ic)
10381
{
10382
Block* BB_then_logical = ctx->block;
10383
append_logical_end(BB_then_logical);
10384
/* branch from logical then block to invert block */
10385
aco_ptr<Pseudo_branch_instruction> branch;
10386
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10387
Format::PSEUDO_BRANCH, 0, 1));
10388
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10389
branch->definitions[0].setHint(vcc);
10390
BB_then_logical->instructions.emplace_back(std::move(branch));
10391
add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10392
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10393
add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10394
BB_then_logical->kind |= block_kind_uniform;
10395
assert(!ctx->cf_info.has_branch);
10396
ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10397
ctx->cf_info.parent_loop.has_divergent_branch = false;
10398
ctx->program->next_divergent_if_logical_depth--;
10399
10400
/** emit linear then block */
10401
Block* BB_then_linear = ctx->program->create_and_insert_block();
10402
BB_then_linear->kind |= block_kind_uniform;
10403
add_linear_edge(ic->BB_if_idx, BB_then_linear);
10404
/* branch from linear then block to invert block */
10405
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10406
Format::PSEUDO_BRANCH, 0, 1));
10407
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10408
branch->definitions[0].setHint(vcc);
10409
BB_then_linear->instructions.emplace_back(std::move(branch));
10410
add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10411
10412
/** emit invert merge block */
10413
ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10414
ic->invert_idx = ctx->block->index;
10415
10416
/* branch to linear else block (skip else) */
10417
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10418
Format::PSEUDO_BRANCH, 0, 1));
10419
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10420
branch->definitions[0].setHint(vcc);
10421
ctx->block->instructions.push_back(std::move(branch));
10422
10423
ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10424
ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10425
ic->exec_potentially_empty_break_depth_old = std::min(
10426
ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10427
/* divergent branches use cbranch_execz */
10428
ctx->cf_info.exec_potentially_empty_discard = false;
10429
ctx->cf_info.exec_potentially_empty_break = false;
10430
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10431
10432
/** emit logical else block */
10433
ctx->program->next_divergent_if_logical_depth++;
10434
Block* BB_else_logical = ctx->program->create_and_insert_block();
10435
add_logical_edge(ic->BB_if_idx, BB_else_logical);
10436
add_linear_edge(ic->invert_idx, BB_else_logical);
10437
ctx->block = BB_else_logical;
10438
append_logical_start(BB_else_logical);
10439
}
10440
10441
static void
10442
end_divergent_if(isel_context* ctx, if_context* ic)
10443
{
10444
Block* BB_else_logical = ctx->block;
10445
append_logical_end(BB_else_logical);
10446
10447
/* branch from logical else block to endif block */
10448
aco_ptr<Pseudo_branch_instruction> branch;
10449
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10450
Format::PSEUDO_BRANCH, 0, 1));
10451
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10452
branch->definitions[0].setHint(vcc);
10453
BB_else_logical->instructions.emplace_back(std::move(branch));
10454
add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10455
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10456
add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10457
BB_else_logical->kind |= block_kind_uniform;
10458
ctx->program->next_divergent_if_logical_depth--;
10459
10460
assert(!ctx->cf_info.has_branch);
10461
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10462
10463
/** emit linear else block */
10464
Block* BB_else_linear = ctx->program->create_and_insert_block();
10465
BB_else_linear->kind |= block_kind_uniform;
10466
add_linear_edge(ic->invert_idx, BB_else_linear);
10467
10468
/* branch from linear else block to endif block */
10469
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10470
Format::PSEUDO_BRANCH, 0, 1));
10471
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10472
branch->definitions[0].setHint(vcc);
10473
BB_else_linear->instructions.emplace_back(std::move(branch));
10474
add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10475
10476
/** emit endif merge block */
10477
ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10478
append_logical_start(ctx->block);
10479
10480
ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10481
ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10482
ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10483
ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10484
ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10485
if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10486
!ctx->cf_info.parent_if.is_divergent) {
10487
ctx->cf_info.exec_potentially_empty_break = false;
10488
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10489
}
10490
/* uniform control flow never has an empty exec-mask */
10491
if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10492
ctx->cf_info.exec_potentially_empty_discard = false;
10493
ctx->cf_info.exec_potentially_empty_break = false;
10494
ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10495
}
10496
}
10497
10498
static void
10499
begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10500
{
10501
assert(cond.regClass() == s1);
10502
10503
append_logical_end(ctx->block);
10504
ctx->block->kind |= block_kind_uniform;
10505
10506
aco_ptr<Pseudo_branch_instruction> branch;
10507
aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10508
branch.reset(
10509
create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10510
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10511
branch->definitions[0].setHint(vcc);
10512
branch->operands[0] = Operand(cond);
10513
branch->operands[0].setFixed(scc);
10514
ctx->block->instructions.emplace_back(std::move(branch));
10515
10516
ic->BB_if_idx = ctx->block->index;
10517
ic->BB_endif = Block();
10518
ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10519
10520
ctx->cf_info.has_branch = false;
10521
ctx->cf_info.parent_loop.has_divergent_branch = false;
10522
10523
/** emit then block */
10524
ctx->program->next_uniform_if_depth++;
10525
Block* BB_then = ctx->program->create_and_insert_block();
10526
add_edge(ic->BB_if_idx, BB_then);
10527
append_logical_start(BB_then);
10528
ctx->block = BB_then;
10529
}
10530
10531
static void
10532
begin_uniform_if_else(isel_context* ctx, if_context* ic)
10533
{
10534
Block* BB_then = ctx->block;
10535
10536
ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10537
ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10538
10539
if (!ic->uniform_has_then_branch) {
10540
append_logical_end(BB_then);
10541
/* branch from then block to endif block */
10542
aco_ptr<Pseudo_branch_instruction> branch;
10543
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10544
Format::PSEUDO_BRANCH, 0, 1));
10545
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10546
branch->definitions[0].setHint(vcc);
10547
BB_then->instructions.emplace_back(std::move(branch));
10548
add_linear_edge(BB_then->index, &ic->BB_endif);
10549
if (!ic->then_branch_divergent)
10550
add_logical_edge(BB_then->index, &ic->BB_endif);
10551
BB_then->kind |= block_kind_uniform;
10552
}
10553
10554
ctx->cf_info.has_branch = false;
10555
ctx->cf_info.parent_loop.has_divergent_branch = false;
10556
10557
/** emit else block */
10558
Block* BB_else = ctx->program->create_and_insert_block();
10559
add_edge(ic->BB_if_idx, BB_else);
10560
append_logical_start(BB_else);
10561
ctx->block = BB_else;
10562
}
10563
10564
static void
10565
end_uniform_if(isel_context* ctx, if_context* ic)
10566
{
10567
Block* BB_else = ctx->block;
10568
10569
if (!ctx->cf_info.has_branch) {
10570
append_logical_end(BB_else);
10571
/* branch from then block to endif block */
10572
aco_ptr<Pseudo_branch_instruction> branch;
10573
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10574
Format::PSEUDO_BRANCH, 0, 1));
10575
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10576
branch->definitions[0].setHint(vcc);
10577
BB_else->instructions.emplace_back(std::move(branch));
10578
add_linear_edge(BB_else->index, &ic->BB_endif);
10579
if (!ctx->cf_info.parent_loop.has_divergent_branch)
10580
add_logical_edge(BB_else->index, &ic->BB_endif);
10581
BB_else->kind |= block_kind_uniform;
10582
}
10583
10584
ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10585
ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10586
10587
/** emit endif merge block */
10588
ctx->program->next_uniform_if_depth--;
10589
if (!ctx->cf_info.has_branch) {
10590
ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10591
append_logical_start(ctx->block);
10592
}
10593
}
10594
10595
static bool
10596
visit_if(isel_context* ctx, nir_if* if_stmt)
10597
{
10598
Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10599
Builder bld(ctx->program, ctx->block);
10600
aco_ptr<Pseudo_branch_instruction> branch;
10601
if_context ic;
10602
10603
if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10604
/**
10605
* Uniform conditionals are represented in the following way*) :
10606
*
10607
* The linear and logical CFG:
10608
* BB_IF
10609
* / \
10610
* BB_THEN (logical) BB_ELSE (logical)
10611
* \ /
10612
* BB_ENDIF
10613
*
10614
* *) Exceptions may be due to break and continue statements within loops
10615
* If a break/continue happens within uniform control flow, it branches
10616
* to the loop exit/entry block. Otherwise, it branches to the next
10617
* merge block.
10618
**/
10619
10620
assert(cond.regClass() == ctx->program->lane_mask);
10621
cond = bool_to_scalar_condition(ctx, cond);
10622
10623
begin_uniform_if_then(ctx, &ic, cond);
10624
visit_cf_list(ctx, &if_stmt->then_list);
10625
10626
begin_uniform_if_else(ctx, &ic);
10627
visit_cf_list(ctx, &if_stmt->else_list);
10628
10629
end_uniform_if(ctx, &ic);
10630
} else { /* non-uniform condition */
10631
/**
10632
* To maintain a logical and linear CFG without critical edges,
10633
* non-uniform conditionals are represented in the following way*) :
10634
*
10635
* The linear CFG:
10636
* BB_IF
10637
* / \
10638
* BB_THEN (logical) BB_THEN (linear)
10639
* \ /
10640
* BB_INVERT (linear)
10641
* / \
10642
* BB_ELSE (logical) BB_ELSE (linear)
10643
* \ /
10644
* BB_ENDIF
10645
*
10646
* The logical CFG:
10647
* BB_IF
10648
* / \
10649
* BB_THEN (logical) BB_ELSE (logical)
10650
* \ /
10651
* BB_ENDIF
10652
*
10653
* *) Exceptions may be due to break and continue statements within loops
10654
**/
10655
10656
begin_divergent_if_then(ctx, &ic, cond);
10657
visit_cf_list(ctx, &if_stmt->then_list);
10658
10659
begin_divergent_if_else(ctx, &ic);
10660
visit_cf_list(ctx, &if_stmt->else_list);
10661
10662
end_divergent_if(ctx, &ic);
10663
}
10664
10665
return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10666
}
10667
10668
static bool
10669
visit_cf_list(isel_context* ctx, struct exec_list* list)
10670
{
10671
foreach_list_typed (nir_cf_node, node, node, list) {
10672
switch (node->type) {
10673
case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10674
case nir_cf_node_if:
10675
if (!visit_if(ctx, nir_cf_node_as_if(node)))
10676
return true;
10677
break;
10678
case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10679
default: unreachable("unimplemented cf list type");
10680
}
10681
}
10682
return false;
10683
}
10684
10685
static void
10686
export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10687
{
10688
assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10689
10690
int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10691
? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10692
: ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10693
unsigned mask = ctx->outputs.mask[slot];
10694
if (!is_pos && !mask)
10695
return;
10696
if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10697
return;
10698
aco_ptr<Export_instruction> exp{
10699
create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10700
exp->enabled_mask = mask;
10701
for (unsigned i = 0; i < 4; ++i) {
10702
if (mask & (1 << i))
10703
exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10704
else
10705
exp->operands[i] = Operand(v1);
10706
}
10707
/* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
10708
* Setting valid_mask=1 prevents it and has no other effect.
10709
*/
10710
exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10711
exp->done = false;
10712
exp->compressed = false;
10713
if (is_pos)
10714
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10715
else
10716
exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10717
ctx->block->instructions.emplace_back(std::move(exp));
10718
}
10719
10720
static void
10721
export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10722
{
10723
aco_ptr<Export_instruction> exp{
10724
create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10725
exp->enabled_mask = 0;
10726
for (unsigned i = 0; i < 4; ++i)
10727
exp->operands[i] = Operand(v1);
10728
if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10729
exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10730
exp->enabled_mask |= 0x1;
10731
}
10732
if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10733
exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10734
exp->enabled_mask |= 0x4;
10735
}
10736
if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10737
if (ctx->options->chip_class < GFX9) {
10738
exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10739
exp->enabled_mask |= 0x8;
10740
} else {
10741
Builder bld(ctx->program, ctx->block);
10742
10743
Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10744
Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10745
if (exp->operands[2].isTemp())
10746
out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10747
10748
exp->operands[2] = Operand(out);
10749
exp->enabled_mask |= 0x4;
10750
}
10751
}
10752
if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10753
exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10754
exp->enabled_mask |= 0x2;
10755
} else if (ctx->options->force_vrs_rates) {
10756
/* Bits [2:3] = VRS rate X
10757
* Bits [4:5] = VRS rate Y
10758
*
10759
* The range is [-2, 1]. Values:
10760
* 1: 2x coarser shading rate in that direction.
10761
* 0: normal shading rate
10762
* -1: 2x finer shading rate (sample shading, not directional)
10763
* -2: 4x finer shading rate (sample shading, not directional)
10764
*
10765
* Sample shading can't go above 8 samples, so both numbers can't be -2
10766
* at the same time.
10767
*/
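/* Worked example derived from the bit layout above: a uniform 2x2 coarse rate would be
 * X = 1 and Y = 1, i.e. (1 << 2) | (1 << 4) = 0x14. */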
10768
Builder bld(ctx->program, ctx->block);
10769
Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10770
10771
/* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10772
Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10773
Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10774
rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10775
bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10776
10777
exp->operands[1] = Operand(rates);
10778
exp->enabled_mask |= 0x2;
10779
}
10780
10781
exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10782
exp->done = false;
10783
exp->compressed = false;
10784
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10785
ctx->block->instructions.emplace_back(std::move(exp));
10786
}
10787
10788
static void
10789
create_vs_exports(isel_context* ctx)
10790
{
10791
assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10792
10793
radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10794
? &ctx->program->info->tes.outinfo
10795
: &ctx->program->info->vs.outinfo;
10796
10797
ctx->block->kind |= block_kind_export_end;
10798
10799
if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10800
ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10801
if (ctx->stage.has(SWStage::TES))
10802
ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10803
get_arg(ctx, ctx->args->ac.tes_patch_id);
10804
else
10805
ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10806
get_arg(ctx, ctx->args->ac.vs_prim_id);
10807
}
10808
10809
if (ctx->options->key.has_multiview_view_index) {
10810
ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10811
ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10812
as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10813
}
10814
10815
/* Hardware requires position data to always be exported, even if the
10816
* application did not write gl_Position.
10817
*/
10818
ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10819
10820
/* the order in which these position exports are created is important: next_pos assigns them consecutive SQ_EXP_POS targets, so the gl_Position export lands in POS0 */
10821
int next_pos = 0;
10822
export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10823
10824
bool writes_primitive_shading_rate =
10825
outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10826
if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10827
writes_primitive_shading_rate) {
10828
export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10829
}
10830
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10831
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10832
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10833
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10834
10835
if (ctx->export_clip_dists) {
10836
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10837
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10838
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10839
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10840
}
10841
10842
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10843
if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10844
i != VARYING_SLOT_VIEWPORT)
10845
continue;
10846
10847
export_vs_varying(ctx, i, false, NULL);
10848
}
10849
}
10850
10851
static bool
10852
export_fs_mrt_z(isel_context* ctx)
10853
{
10854
Builder bld(ctx->program, ctx->block);
10855
unsigned enabled_channels = 0;
10856
bool compr = false;
10857
Operand values[4];
10858
10859
for (unsigned i = 0; i < 4; ++i) {
10860
values[i] = Operand(v1);
10861
}
10862
10863
/* Both stencil and sample mask only need 16-bits. */
10864
if (!ctx->program->info->ps.writes_z &&
10865
(ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10866
compr = true; /* COMPR flag */
10867
10868
if (ctx->program->info->ps.writes_stencil) {
10869
/* Stencil should be in X[23:16]. */
10870
values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10871
values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10872
enabled_channels |= 0x3;
10873
}
10874
10875
if (ctx->program->info->ps.writes_sample_mask) {
10876
/* SampleMask should be in Y[15:0]. */
10877
values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10878
enabled_channels |= 0xc;
10879
}
10880
} else {
10881
if (ctx->program->info->ps.writes_z) {
10882
values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10883
enabled_channels |= 0x1;
10884
}
10885
10886
if (ctx->program->info->ps.writes_stencil) {
10887
values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10888
enabled_channels |= 0x2;
10889
}
10890
10891
if (ctx->program->info->ps.writes_sample_mask) {
10892
values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10893
enabled_channels |= 0x4;
10894
}
10895
}
10896
10897
/* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10898
* writemask component.
10899
*/
10900
if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10901
ctx->options->family != CHIP_HAINAN) {
10902
enabled_channels |= 0x1;
10903
}
10904
10905
bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10906
V_008DFC_SQ_EXP_MRTZ, compr);
10907
10908
return true;
10909
}
10910
10911
static bool
10912
export_fs_mrt_color(isel_context* ctx, int slot)
10913
{
10914
Builder bld(ctx->program, ctx->block);
10915
unsigned write_mask = ctx->outputs.mask[slot];
10916
Operand values[4];
10917
10918
for (unsigned i = 0; i < 4; ++i) {
10919
if (write_mask & (1 << i)) {
10920
values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10921
} else {
10922
values[i] = Operand(v1);
10923
}
10924
}
10925
10926
unsigned target, col_format;
10927
unsigned enabled_channels = 0;
10928
aco_opcode compr_op = (aco_opcode)0;
10929
bool compr = false;
10930
10931
slot -= FRAG_RESULT_DATA0;
10932
target = V_008DFC_SQ_EXP_MRT + slot;
10933
col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10934
10935
bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10936
bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10937
bool is_16bit = values[0].regClass() == v2b;
10938
10939
/* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10940
if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10941
(col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10942
col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10943
col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10944
for (int i = 0; i < 4; i++) {
10945
if (!(write_mask & (1 << i)))
10946
continue;
10947
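/* v_cmp_class_f32 with a class mask of 3 (bit 0 = signaling NaN, bit 1 = quiet NaN)
 * tests for any NaN. */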
10948
Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10949
values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10950
values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10951
bld.copy(bld.def(v1), Operand::zero()), isnan);
10952
}
10953
}
10954
10955
switch (col_format) {
10956
case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10957
10958
case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10959
10960
case V_028714_SPI_SHADER_32_AR:
10961
if (ctx->options->chip_class >= GFX10) {
10962
/* Special case: on GFX10, the outputs are different for 32_AR */
10963
enabled_channels = 0x3;
10964
values[1] = values[3];
10965
values[3] = Operand(v1);
10966
} else {
10967
enabled_channels = 0x9;
10968
}
10969
break;
10970
10971
case V_028714_SPI_SHADER_FP16_ABGR:
10972
for (int i = 0; i < 2; i++) {
10973
bool enabled = (write_mask >> (i * 2)) & 0x3;
10974
if (enabled) {
10975
enabled_channels |= 0x3 << (i * 2);
10976
if (is_16bit) {
10977
values[i] =
10978
bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10979
values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10980
values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10981
} else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10982
values[i] =
10983
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10984
values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10985
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10986
} else {
10987
values[i] =
10988
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10989
values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10990
values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10991
}
10992
} else {
10993
values[i] = Operand(v1);
10994
}
10995
}
10996
values[2] = Operand(v1);
10997
values[3] = Operand(v1);
10998
compr = true;
10999
break;
11000
11001
case V_028714_SPI_SHADER_UNORM16_ABGR:
11002
if (is_16bit && ctx->options->chip_class >= GFX9) {
11003
compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
11004
} else {
11005
compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
11006
}
11007
break;
11008
11009
case V_028714_SPI_SHADER_SNORM16_ABGR:
11010
if (is_16bit && ctx->options->chip_class >= GFX9) {
11011
compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
11012
} else {
11013
compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
11014
}
11015
break;
11016
11017
case V_028714_SPI_SHADER_UINT16_ABGR: {
11018
compr_op = aco_opcode::v_cvt_pk_u16_u32;
11019
if (is_int8 || is_int10) {
11020
/* clamp */
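/* is_int8 formats have 8-bit components (max 255); is_int10 formats have 10-bit RGB
 * components (max 1023) and a 2-bit alpha, hence the separate clamp to 3 for component 3. */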
11021
uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
11022
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11023
11024
for (unsigned i = 0; i < 4; i++) {
11025
if ((write_mask >> i) & 1) {
11026
values[i] =
11027
bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
11028
i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
11029
}
11030
}
11031
} else if (is_16bit) {
11032
for (unsigned i = 0; i < 4; i++) {
11033
if ((write_mask >> i) & 1) {
11034
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
11035
values[i] = Operand(tmp);
11036
}
11037
}
11038
}
11039
break;
11040
}
11041
11042
case V_028714_SPI_SHADER_SINT16_ABGR:
11043
compr_op = aco_opcode::v_cvt_pk_i16_i32;
11044
if (is_int8 || is_int10) {
11045
/* clamp */
11046
uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11047
uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11048
Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11049
Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11050
11051
for (unsigned i = 0; i < 4; i++) {
11052
if ((write_mask >> i) & 1) {
11053
values[i] =
11054
bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11055
i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11056
values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11057
i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11058
values[i]);
11059
}
11060
}
11061
} else if (is_16bit) {
11062
for (unsigned i = 0; i < 4; i++) {
11063
if ((write_mask >> i) & 1) {
11064
Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11065
values[i] = Operand(tmp);
11066
}
11067
}
11068
}
11069
break;
11070
11071
case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11072
11073
case V_028714_SPI_SHADER_ZERO:
11074
default: return false;
11075
}
11076
11077
if ((bool)compr_op) {
11078
for (int i = 0; i < 2; i++) {
11079
/* check if at least one of the values to be compressed is enabled */
11080
bool enabled = (write_mask >> (i * 2)) & 0x3;
11081
if (enabled) {
11082
enabled_channels |= 0x3 << (i * 2);
11083
values[i] = bld.vop3(
11084
compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11085
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11086
} else {
11087
values[i] = Operand(v1);
11088
}
11089
}
11090
values[2] = Operand(v1);
11091
values[3] = Operand(v1);
11092
compr = true;
11093
} else if (!compr) {
11094
for (int i = 0; i < 4; i++)
11095
values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11096
}
11097
11098
bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11099
compr);
11100
return true;
11101
}
11102
11103
static void
create_fs_null_export(isel_context* ctx)
{
   /* FS must always have exports.
    * So when there are none, we need to add a null export.
    */

   Builder bld(ctx->program, ctx->block);
   unsigned dest = V_008DFC_SQ_EXP_NULL;
   bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
           /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
}

static void
create_fs_exports(isel_context* ctx)
{
   bool exported = false;

   /* Export depth, stencil and sample mask. */
   if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
       ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
      exported |= export_fs_mrt_z(ctx);

   /* Export all color render targets. */
   for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
      if (ctx->outputs.mask[i])
         exported |= export_fs_mrt_color(ctx, i);

   if (!exported)
      create_fs_null_export(ctx);

   ctx->block->kind |= block_kind_export_end;
}

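/* Helper for the merged-shader paths below: emits a p_barrier that both
 * synchronizes all waves of the workgroup and acts as an acquire-release
 * fence for LDS (storage_shared) at workgroup scope.
 */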
static void
create_workgroup_barrier(Builder& bld)
{
   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
}

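/* Stores one transform feedback output (a run of consecutive components) to
 * its streamout buffer with MUBUF stores. Components the shader never wrote
 * are stored as zero, and vec3 stores are split in two on GFX6, which has no
 * buffer_store_dwordx3.
 */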
static void
emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
                   const struct radv_stream_output* output)
{
   unsigned num_comps = util_bitcount(output->component_mask);
   unsigned writemask = (1 << num_comps) - 1;
   unsigned loc = output->location;
   unsigned buf = output->buffer;

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   unsigned first_comp = ffs(output->component_mask) - 1;

   Temp out[4];
   bool all_undef = true;
   assert(ctx->stage.hw == HWStage::VS);
   for (unsigned i = 0; i < num_comps; i++) {
      out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
      all_undef = all_undef && !out[i].id();
   }
   if (all_undef)
      return;

   while (writemask) {
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      if (count == 3 && ctx->options->chip_class == GFX6) {
         /* GFX6 doesn't support storing vec3, split it. */
         writemask |= 1u << (start + 2);
         count = 2;
      }

      unsigned offset = output->offset + start * 4;

      Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
      for (int i = 0; i < count; ++i)
         vec->operands[i] =
            (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand::zero();
      vec->definitions[0] = Definition(write_data);
      ctx->block->instructions.emplace_back(std::move(vec));

      aco_opcode opcode;
      switch (count) {
      case 1: opcode = aco_opcode::buffer_store_dword; break;
      case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
      case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
      case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
      default: unreachable("Unsupported dword count.");
      }

      aco_ptr<MUBUF_instruction> store{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
      store->operands[0] = Operand(so_buffers[buf]);
      store->operands[1] = Operand(so_write_offset[buf]);
      store->operands[2] = Operand::c32(0);
      store->operands[3] = Operand(write_data);
      if (offset > 4095) {
         /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway.
          * Fold the constant offset into the VGPR write offset instead of using the
          * 12-bit immediate field.
          */
         Builder bld(ctx->program, ctx->block);
         store->operands[1] =
            bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
      } else {
         store->offset = offset;
      }
      store->offen = true;
      store->glc = true;
      store->dlc = false;
      store->slc = true;
      ctx->block->instructions.emplace_back(std::move(store));
   }
}

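/* Emits all transform feedback stores for the given vertex stream. The number
 * of vertices that still fit in the streamout buffers is extracted from the
 * streamout_config SGPR, and lanes whose vertex index exceeds that count are
 * skipped with a divergent branch before the per-output stores are emitted.
 */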
static void
emit_streamout(isel_context* ctx, unsigned stream)
{
   Builder bld(ctx->program, ctx->block);

   Temp so_buffers[4];
   Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      Operand off = bld.copy(bld.def(s1), Operand::c32(i * 16u));
      so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
   }

   Temp so_vtx_count =
      bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
               get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));

   Temp tid = emit_mbcnt(ctx, bld.tmp(v1));

   Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);

   if_context ic;
   begin_divergent_if_then(ctx, &ic, can_emit);

   bld.reset(ctx->block);

   Temp so_write_index =
      bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);

   Temp so_write_offset[4];

   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      if (stride == 1) {
         Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
                                get_arg(ctx, ctx->args->ac.streamout_write_index),
                                get_arg(ctx, ctx->args->ac.streamout_offset[i]));
         Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);

         so_write_offset[i] =
            bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
      } else {
         Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
         Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
         so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
      }
   }

   for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
      struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
      if (stream != output->stream)
         continue;

      emit_stream_output(ctx, so_buffers, so_write_offset, output);
   }

   begin_divergent_if_else(ctx, &ic);
   end_divergent_if(ctx, &ic);
}

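/* Creates the p_startpgm pseudo instruction: one definition per enabled shader
 * argument, fixed to the SGPR/VGPR the hardware loads it into (VGPR arguments
 * start at physical register 256 in ACO's register numbering).
 */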
Pseudo_instruction*
add_startpgm(struct isel_context* ctx)
{
   unsigned arg_count = ctx->args->ac.arg_count;
   if (ctx->stage == fragment_fs) {
      /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr
       * itself and then communicates the results back via the ELF binary.
       * Mirror what LLVM does by re-mapping the VGPR arguments here.
       *
       * TODO: If we made the FS input scanning code into a separate pass that
       * could run before argument setup, then this wouldn't be necessary
       * anymore.
       */
      struct ac_shader_args* args = &ctx->args->ac;
      arg_count = 0;
      for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) {
         if (args->args[i].file != AC_ARG_VGPR) {
            arg_count++;
            continue;
         }

         if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) {
            args->args[i].skip = true;
         } else {
            args->args[i].offset = vgpr_reg;
            vgpr_reg += args->args[i].size;
            arg_count++;
         }
         vgpr_arg++;
      }
   }

   aco_ptr<Pseudo_instruction> startpgm{
      create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)};
   for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
      if (ctx->args->ac.args[i].skip)
         continue;

      enum ac_arg_regfile file = ctx->args->ac.args[i].file;
      unsigned size = ctx->args->ac.args[i].size;
      unsigned reg = ctx->args->ac.args[i].offset;
      RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
      Temp dst = ctx->program->allocateTmp(type);
      ctx->arg_temps[i] = dst;
      startpgm->definitions[arg] = Definition(dst);
      startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
      arg++;
   }
   Pseudo_instruction* instr = startpgm.get();
   ctx->block->instructions.push_back(std::move(startpgm));

   /* Stash these in the program so that they can be accessed later when
    * handling spilling.
    */
   ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
   ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);

   return instr;
}

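/* Workaround for chips with the LS VGPR init bug (see has_ls_vgpr_init_bug in
 * select_program below): when the workgroup contains no HS threads, the SPI
 * loads the LS input VGPRs starting at VGPR 0 instead of their normal slots,
 * so the affected arguments are picked from the shifted locations with
 * v_cndmask_b32.
 */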
void
fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
{
   assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
   Builder bld(ctx->program, ctx->block);
   constexpr unsigned hs_idx = 1u;
   Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                                              get_arg(ctx, ctx->args->ac.merged_wave_info),
                                              Operand::c32((8u << 16) | (hs_idx * 8u)));
   Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());

   /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */

   Temp instance_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
               get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
   Temp vs_rel_patch_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
               get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
   Temp vertex_id =
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
               get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);

   ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
   ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
   ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
}

void
split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
{
   /* Split all arguments except for the first (ring_offsets) and the last
    * (exec) so that the dead channels don't stay live throughout the program.
    */
   for (int i = 1; i < startpgm->definitions.size(); i++) {
      if (startpgm->definitions[i].regClass().size() > 1) {
         emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
                           startpgm->definitions[i].regClass().size());
      }
   }
}

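/* If both center and centroid interpolation are in use, the hardware only
 * computes one set of barycentrics and reports which one through the sign bit
 * of prim_mask, so the centroid coordinates have to be replaced by the center
 * ones whenever that bit is set.
 */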
void
handle_bc_optimize(isel_context* ctx)
{
   /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
   Builder bld(ctx->program, ctx->block);
   uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
   bool uses_center =
      G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
   bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) ||
                        G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
   ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
   ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
   if (uses_center && uses_centroid) {
      Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
                              get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());

      if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
         Temp new_coord[2];
         for (unsigned i = 0; i < 2; i++) {
            Temp persp_centroid =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
            Temp persp_center =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
            new_coord[i] =
               bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
         }
         ctx->persp_centroid = bld.tmp(v2);
         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
                    Operand(new_coord[0]), Operand(new_coord[1]));
         emit_split_vector(ctx, ctx->persp_centroid, 2);
      }

      if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
         Temp new_coord[2];
         for (unsigned i = 0; i < 2; i++) {
            Temp linear_centroid =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
            Temp linear_center =
               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
            new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
                                    linear_center, sel);
         }
         ctx->linear_centroid = bld.tmp(v2);
         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
                    Operand(new_coord[0]), Operand(new_coord[1]));
         emit_split_vector(ctx, ctx->linear_centroid, 2);
      }
   }
}

void
setup_fp_mode(isel_context* ctx, nir_shader* shader)
{
   Program* program = ctx->program;

   unsigned float_controls = shader->info.float_controls_execution_mode;

   program->next_fp_mode.preserve_signed_zero_inf_nan32 =
      float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
      float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
                        FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);

   program->next_fp_mode.must_flush_denorms32 =
      float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
   program->next_fp_mode.must_flush_denorms16_64 =
      float_controls &
      (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);

   program->next_fp_mode.care_about_round32 =
      float_controls &
      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);

   program->next_fp_mode.care_about_round16_64 =
      float_controls &
      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
       FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);

   /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
    * the precision seems needed for Wolfenstein: Youngblood to render correctly */
   if (program->next_fp_mode.must_flush_denorms16_64)
      program->next_fp_mode.denorm16_64 = 0;
   else
      program->next_fp_mode.denorm16_64 = fp_denorm_keep;

   /* preserving fp32 denorms is expensive, so only do it if asked */
   if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
      program->next_fp_mode.denorm32 = fp_denorm_keep;
   else
      program->next_fp_mode.denorm32 = 0;

   if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      program->next_fp_mode.round32 = fp_round_tz;
   else
      program->next_fp_mode.round32 = fp_round_ne;

   if (float_controls &
       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      program->next_fp_mode.round16_64 = fp_round_tz;
   else
      program->next_fp_mode.round16_64 = fp_round_ne;

   ctx->block->fp_mode = program->next_fp_mode;
}

void
cleanup_cfg(Program* program)
{
   /* create linear_succs/logical_succs */
   for (Block& BB : program->blocks) {
      for (unsigned idx : BB.linear_preds)
         program->blocks[idx].linear_succs.emplace_back(BB.index);
      for (unsigned idx : BB.logical_preds)
         program->blocks[idx].logical_succs.emplace_back(BB.index);
   }
}

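/* Turns a lane count in an SGPR into an exec-style bitmask with that many low
 * bits set. s_bfm_b64 computes (1ull << count) - 1 but treats a count of 64 as
 * 0, so for wave64 the all-ones mask is selected explicitly when bit 6 of the
 * count is set.
 */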
Temp
lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
{
   assert(count.regClass() == s1);

   Builder bld(ctx->program, ctx->block);
   Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
   Temp cond;

   if (ctx->program->wave_size == 64) {
      /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */
      if (!allow64)
         return mask;

      /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
      Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
                                Operand::c32(6u /* log2(64) */));
      cond =
         bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
   } else {
      /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
       * the register */
      cond = emit_extract_vector(ctx, mask, 0, bld.lm);
   }

   return cond;
}

Temp
merged_wave_info_to_mask(isel_context* ctx, unsigned i)
{
   Builder bld(ctx->program, ctx->block);

   /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
   Temp count = i == 0
                   ? get_arg(ctx, ctx->args->ac.merged_wave_info)
                   : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
                              get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));

   return lanecount_to_mask(ctx, count);
}

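/* Sends the GS_ALLOC_REQ message for NGG workgroups: the vertex count goes
 * into the low 12 bits of m0 and the primitive count into the bits above
 * them. On Navi 1x an empty workgroup still has to allocate one primitive and
 * one vertex and export a degenerate (NaN-position) triangle.
 */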
void
ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
{
   assert(vtx_cnt.id() && prm_cnt.id());

   Builder bld(ctx->program, ctx->block);
   Temp prm_cnt_0;

   if (ctx->program->chip_class == GFX10 &&
       (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
      /* Navi 1x workaround: check whether the workgroup has no output.
       * If so, change the number of exported vertices and primitives to 1.
       */
      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
                         bld.scc(prm_cnt_0));
      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
                         bld.scc(prm_cnt_0));
   }

   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
   Temp tmp =
      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);

   /* Request the SPI to allocate space for the primitives and vertices
    * that will be exported by the threadgroup.
    */
   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);

   if (prm_cnt_0.id()) {
      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
       * It can't have all-zero positions because that would render an undesired pixel with
       * conservative rasterization.
       */
      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));

      if_context ic_prim_0;
      begin_divergent_if_then(ctx, &ic_prim_0, cond);
      bld.reset(ctx->block);
      ctx->block->kind |= block_kind_export_end;

      /* Use zero: means that it's a triangle whose every vertex index is 0. */
      Temp zero = bld.copy(bld.def(v1), Operand::zero());
      /* Use NaN for the coordinates, so that the rasterizer always culls it. */
      Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));

      bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
              V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
              false /* valid mask */);
      bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
              V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
              true /* valid mask */);

      begin_divergent_if_else(ctx, &ic_prim_0);
      end_divergent_if(ctx, &ic_prim_0);
      bld.reset(ctx->block);
   }
}

} /* end namespace */

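/* Main entry point of instruction selection: lowers each NIR shader of a
 * (possibly merged) pipeline stage into ACO IR, wrapping later merged shader
 * parts in a divergent branch on merged_wave_info so that only their own
 * waves execute them.
 */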
void
select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
               ac_shader_config* config, struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
   if_context ic_merged_wave_info;
   bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);

   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader* nir = shaders[i];
      init_context(&ctx, nir);

      setup_fp_mode(&ctx, nir);

      if (!i) {
         /* needs to be after init_context() for FS */
         Pseudo_instruction* startpgm = add_startpgm(&ctx);
         append_logical_start(ctx.block);

         if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
            fix_ls_vgpr_init_bug(&ctx, startpgm);

         split_arguments(&ctx, startpgm);

         if (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES)) {
            Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
         }
      }

      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
      nir_function_impl* func = nir_shader_get_entrypoint(nir);
      bool empty_shader =
         nir_cf_list_is_empty_block(&func->body) &&
         ((nir->info.stage == MESA_SHADER_VERTEX &&
           (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
          (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));

      bool check_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
      bool endif_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));

      if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
          program->stage.num_sw_stages() == 1) {
         /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
          * s_sendmsg(GS_ALLOC_REQ). */
         Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
      }

      if (check_merged_wave_info) {
         Temp cond = merged_wave_info_to_mask(&ctx, i);
         begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
      }

      if (i) {
         Builder bld(ctx.program, ctx.block);

         /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
         bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
                                 ctx.tcs_temp_only_inputs == nir->info.inputs_read;

         if (!ngg_gs && !tcs_skip_barrier)
            create_workgroup_barrier(bld);

         if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
            ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
                                        get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
                                        Operand::c32(8u), Operand::zero());
         }
      } else if (ctx.stage == geometry_gs)
         ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);

      if (ctx.stage == fragment_fs)
         handle_bc_optimize(&ctx);

      visit_cf_list(&ctx, &func->body);

      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
         emit_streamout(&ctx, 0);

      if (ctx.stage.hw == HWStage::VS) {
         create_vs_exports(&ctx);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
         Builder bld(ctx.program, ctx.block);
         bld.barrier(aco_opcode::p_barrier,
                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
                  sendmsg_gs_done(false, false, 0));
      }

      if (ctx.stage == fragment_fs) {
         create_fs_exports(&ctx);
      }

      if (endif_merged_wave_info) {
         begin_divergent_if_else(&ctx, &ic_merged_wave_info);
         end_divergent_if(&ctx, &ic_merged_wave_info);
      }

      if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
         /* Outputs of the previous stage are inputs to the next stage */
         ctx.inputs = ctx.outputs;
         ctx.outputs = shader_io_state();
      }

      cleanup_context(&ctx);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   Builder bld(ctx.program, ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

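/* Builds the GS copy shader: a small hardware VS that reads the GS output
 * (GSVS) ring with per-vertex offsets and re-emits the attributes as VS
 * exports, handling streamout and, when several streams are written,
 * branching on the stream id extracted from streamout_config.
 */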
void
select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                      struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
                             program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));

   Operand stream_id = Operand::zero();
   if (args->shader_info->so.num_outputs)
      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                           get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));

   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
                              get_arg(&ctx, ctx.args->ac.vertex_id));

   std::stack<if_context> if_contexts;

   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream_id.isConstant() && stream != stream_id.constantValue())
         continue;

      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
         continue;

      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));

      if (!stream_id.isConstant()) {
         Temp cond =
            bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
         if_contexts.emplace();
         begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
         bld.reset(ctx.block);
      }

      unsigned offset = 0;
      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
         if (args->shader_info->gs.output_streams[i] != stream)
            continue;

         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
         unsigned length = util_last_bit(output_usage_mask);
         for (unsigned j = 0; j < length; ++j) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            Temp val = bld.tmp(v1);
            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
                            true, true);

            ctx.outputs.mask[i] |= 1 << j;
            ctx.outputs.temps[i * 4u + j] = val;

            offset++;
         }
      }

      if (args->shader_info->so.num_outputs) {
         emit_streamout(&ctx, stream);
         bld.reset(ctx.block);
      }

      if (stream == 0) {
         create_vs_exports(&ctx);
      }

      if (!stream_id.isConstant()) {
         begin_uniform_if_else(&ctx, &if_contexts.top());
         bld.reset(ctx.block);
      }
   }

   while (!if_contexts.empty()) {
      end_uniform_if(&ctx, &if_contexts.top());
      if_contexts.pop();
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.reset(ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

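/* Builds a minimal trap handler for GFX8: it loads a buffer descriptor from
 * the TMA register pair and dumps TTMP0-TTMP1 plus a few hardware registers
 * (STATUS, TRAP_STS, HW_ID, IB_STS) into that buffer for later inspection.
 */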
void
select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
                           struct radv_shader_args* args)
{
   assert(args->options->chip_class == GFX8);

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   program->workgroup_size = 1; /* XXX */

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   /* Load the buffer descriptor from TMA. */
   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
            Operand::zero());

   /* Store TTMP0-TTMP1. */
   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);

   uint32_t hw_regs_idx[] = {
      2, /* HW_REG_STATUS */
      3, /* HW_REG_TRAP_STS */
      4, /* HW_REG_HW_ID */
      7, /* HW_REG_IB_STS */
   };

   /* Store some hardware registers. */
   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
      /* "((size - 1) << 11) | register" */
      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
               ((20 - 1) << 11) | hw_regs_idx[i]);

      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}
} // namespace aco