GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_ir.h
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include "aco_opcodes.h"
#include "aco_util.h"

#include "vulkan/radv_shader.h"

#include "nir.h"

#include <bitset>
#include <memory>
#include <vector>

struct radv_shader_args;
struct radv_shader_info;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
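
/* Illustrative example (not part of the original header): the Vector ALU
 * formats occupy distinct bits, so combined encodings are formed bitwise;
 * the asVOP3() helper further below performs exactly this combination:
 *   Format f = (Format)((uint16_t)Format::VOP2 | (uint16_t)Format::VOP3);
 *   assert((uint16_t)f & (uint16_t)Format::VOP2); // still identifiable as VOP2
 */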

enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};

enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8,
};

enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for loads/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
   {}

   storage_class storage : 8;
   memory_semantics semantics : 8;
   sync_scope scope : 8;

   bool operator==(const memory_sync_info& rhs) const
   {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   bool can_reorder() const
   {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
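
/* Illustrative example (not part of the original header): sync info for a
 * release store to an SSBO at workgroup scope; acquire/release semantics
 * forbid reordering:
 *   memory_sync_info sync(storage_buffer, semantic_release, scope_workgroup);
 *   assert(!sync.can_reorder());
 */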

enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
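
/* Illustrative sketch (not part of the original header): a stricter mode can
 * stand in for a relaxed one, but not the reverse:
 *   float_mode strict = {}, relaxed = {};
 *   strict.preserve_signed_zero_inf_nan32 = true;
 *   assert(strict.canReplace(relaxed));
 *   assert(!relaxed.canReplace(strict));
 */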

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   wait_imm(enum chip_class chip, uint16_t packed);

   uint16_t pack(enum chip_class chip) const;

   bool combine(const wait_imm& other);

   bool empty() const;
};

constexpr Format
asVOP3(Format format)
{
   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
}

constexpr Format
asSDWA(Format format)
{
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
}

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
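
/* Illustrative example (not part of the original header): RegClass::get()
 * picks a sub-dword class when the byte size is not dword-aligned, and
 * rounds SGPRs up to whole dwords:
 *   static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "");
 *   static_assert(RegClass::get(RegType::vgpr, 8) == v2, "");
 *   static_assert(RegClass::get(RegType::sgpr, 6) == s2, "");
 */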

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};
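
/* Illustrative example (not part of the original header): a 64-bit SGPR
 * temporary with SSA id 42:
 *   constexpr Temp tmp(42, s2);
 *   static_assert(tmp.bytes() == 8, "");
 *   static_assert(tmp.type() == RegType::sgpr, "");
 */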

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   uint16_t reg_b = 0;
};
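
/* Illustrative example (not part of the original header): reg_b addresses
 * bytes, so advancing a register by two bytes selects the high half of its
 * dword:
 *   PhysReg r{256}; // v0 (VGPRs start at encoding 256)
 *   PhysReg hi = r.advance(2);
 *   assert(hi.reg() == 256 && hi.byte() == 2);
 */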

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value.
 * Temporary registers get mapped to physical registers during RA;
 * constant values are inlined into the instruction sequence.
 */
class Operand final {
public:
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   }
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   }

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies, and copies from any 8-bit
       * constant can be implemented with an SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   }

   /* 16-bit constant */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double. */
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create an unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }
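
   /* Illustrative example (not part of the original header): inline constants
    * land in their dedicated encoding slot; everything else becomes a literal
    * (PhysReg 255):
    *   Operand a = Operand::c32(0x3f800000u); // 1.0f
    *   assert(!a.isLiteral() && a.physReg() == PhysReg{242});
    *   Operand b = Operand::c32(0x12345678u); // arbitrary value
    *   assert(b.isLiteral());
    */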

   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   }
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* Use this instead of the constructors when the operand size is only known
    * at runtime, or when a chip class is needed to decide whether 1/2 PI is an
    * inline constant.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }
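
   /* Illustrative example (not part of the original header): 64-bit values are
    * representable if they match the inline-constant table or, when allowed,
    * zero-/sign-extend from 32 bits:
    *   assert(Operand::is_constant_representable(0x3FF0000000000000, 8)); // 1.0
    *   assert(Operand::is_constant_representable(0xFFFFFFFF80000000, 8, false, true));
    *   assert(!Operand::is_constant_representable(0x123456789ABCDEF0, 8, true, true));
    */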

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers.
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept { return hasHint_; }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t hasHint_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
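
/* Illustrative example (not part of the original header): a definition of a
 * 64-bit SGPR pair fixed to vcc:
 *   Definition def(Temp(7, s2));
 *   def.setFixed(vcc);
 *   assert(def.isFixed() && def.physReg() == vcc);
 */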

struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
struct VOP3_instruction;
struct Interp_instruction;
struct DPP_instruction;
struct SDWA_instruction;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP_instruction& dpp() noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   const DPP_instruction& dpp() const noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
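
/* Illustrative example (not part of the original header): because the VOP1/
 * VOP2/VOPC/VOP3 queries test format bits rather than exact values, a VOP2
 * instruction promoted to VOP3 encoding answers true to both isVOP2() and
 * isVOP3():
 *   assert((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP2);
 */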

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
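
/* Illustrative example (not part of the original header): decoding a selector
 * with the masks above; sdwa_sword1 is the sign-extended second word:
 *   static_assert((sdwa_sword1 & sdwa_asuint) == sdwa_uword1, "");
 *   static_assert((sdwa_sword1 & sdwa_sext) != 0, "");
 *   static_assert((sdwa_uword1 & sdwa_wordnum) == 1, "");
 */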

/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];
   uint8_t dst_sel;
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset;          /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};

/**
 * Subgroup Reduction Instructions: everything except the data to be reduced
 * and the result is inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");

struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
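
/* Illustrative sketch (not part of the original header; opcode and register
 * choices are arbitrary): allocating a one-operand, one-definition SOP1
 * instruction and filling it in:
 *   aco_ptr<Instruction> mov{create_instruction<SOP1_instruction>(
 *      aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
 *   mov->operands[0] = Operand::c32(42u);
 *   mov->definitions[0] = Definition(PhysReg{0}, s1); // write s0
 */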

constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}

constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
bool needs_exec_mask(const Instruction* instr);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

bool should_form_clause(const Instruction* a, const Instruction* b);

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1763
1764
struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};

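/* Illustrative sketch (not part of the upstream header): RegisterDemand is a
 * plain value type; demands combine component-wise, and exceeds()/update() are
 * used to test limits and track peak pressure. The numbers are made up.
 *
 *   RegisterDemand limit(24, 16);                // 24 VGPRs, 16 SGPRs
 *   RegisterDemand demand(20, 10);
 *   demand += RegisterDemand(8, 2);              // now {28, 12}
 *   bool needs_spilling = demand.exceeds(limit); // true: 28 > 24
 *
 *   RegisterDemand peak;
 *   peak.update(demand);                         // component-wise maximum
 */
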
/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */

   Block() : index(0) {}
};

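/* Illustrative sketch (not part of the upstream header): Block::kind is a
 * bitmask of block_kind flags and CFG edges are stored as block indices, so a
 * pass typically walks them like this ("program", "idx" and "process" are
 * assumed):
 *
 *   Block& block = program->blocks[idx];
 *   if (block.kind & (block_kind_merge | block_kind_loop_exit)) {
 *      for (unsigned pred : block.logical_preds)
 *         process(program->blocks[pred]);
 *   }
 */
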
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint8_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   GSCopy = 1 << 6, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

constexpr SWStage
operator|(SWStage a, SWStage b)
{
   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
}

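/* For example, the merged-stage values above are exactly the OR of their
 * components:
 *
 *   static_assert(SWStage::VS_GS == (SWStage::VS | SWStage::GS), "");
 */
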
/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES,  /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};

/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
 */
struct Stage {
   constexpr Stage() = default;

   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}

   /* Check if the given SWStage is included */
   constexpr bool has(SWStage stage) const
   {
      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
   }

   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint8_t>(sw)); }

   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }

   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }

   /* Mask of merged software stages */
   SWStage sw = SWStage::None;

   /* Active hardware stage */
   HWStage hw{};
};

/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);

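/* Illustrative sketch: a merged stage reports each contained software stage
 * through Stage::has(), e.g. for the GFX9 VS+GS combination defined above:
 *
 *   static_assert(vertex_geometry_gs.has(SWStage::VS), "");
 *   static_assert(vertex_geometry_gs.has(SWStage::GS), "");
 *   static_assert(!vertex_geometry_gs.has(SWStage::TES), "");
 */
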
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};

enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};

class Program final {
public:
   std::vector<Block> blocks;
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};

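/* Illustrative sketch (not part of the upstream header): typical use of the
 * Program interface when building IR; the register class and the constant
 * operand are assumptions made up for the example.
 *
 *   Block* block = program->create_and_insert_block();
 *   Temp dst = program->allocateTmp(v1); // fresh SSA id of class v1
 *   aco_ptr<Instruction> mov{create_instruction<VOP1_instruction>(
 *      aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
 *   mov->operands[0] = Operand((uint32_t)0);
 *   mov->definitions[0] = Definition(dst);
 *   block->instructions.emplace_back(std::move(mov));
 */
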
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};

void init();

void init_program(Program* program, Stage stage, struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config, struct radv_shader_args* args);

void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...) \
   do { \
   } while (0)
#endif

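/* Illustrative sketch (not from the upstream header): one plausible ordering of
 * the passes declared above; the authoritative sequence lives in the driver
 * (aco_interface.cpp) and differs in detail.
 *
 *   lower_phis(program);
 *   value_numbering(program);
 *   optimize(program);
 *   live live_vars = live_var_analysis(program);
 *   spill(program, live_vars);
 *   schedule_program(program, live_vars);
 *   register_allocation(program, live_vars.live_out);
 *   ssa_elimination(program);
 *   lower_to_hw_instr(program);
 *   insert_wait_states(program);
 *   insert_NOPs(program);
 *   std::vector<uint32_t> code;
 *   unsigned exec_size = emit_program(program, code);
 */
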
void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get the number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

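/* Illustrative sketch (an assumption, not the upstream implementation): the
 * *_alloc helpers conceptually round the addressable count up to the hardware
 * allocation granule from DeviceInfo, along the lines of:
 *
 *   uint16_t granule = program->dev.vgpr_alloc_granule; // power of two
 *   uint16_t allocated = (addressable_vgprs + granule - 1) & ~(granule - 1);
 */
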
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

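/* Illustrative sketch: per-opcode properties are indexed by the opcode value,
 * e.g. to look up an instruction's mnemonic and encoding format:
 *
 *   int idx = static_cast<int>(instr->opcode);
 *   const char* mnemonic = instr_info.name[idx];
 *   bool is_smem = instr_info.format[idx] == Format::SMEM;
 */
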
} // namespace aco

#endif /* ACO_IR_H */