CoCalc -- aco_opcodes.py

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_opcodes.py
⁷¹³⁹ views
1
#
2
# Copyright (c) 2018 Valve Corporation
3
#
4
# Permission is hereby granted, free of charge, to any person obtaining a
5
# copy of this software and associated documentation files (the "Software"),
6
# to deal in the Software without restriction, including without limitation
7
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
# and/or sell copies of the Software, and to permit persons to whom the
9
# Software is furnished to do so, subject to the following conditions:
10
#
11
# The above copyright notice and this permission notice (including the next
12
# paragraph) shall be included in all copies or substantial portions of the
13
# Software.
14
#
15
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
# IN THE SOFTWARE.
22
#
23

24
# Class that represents all the information we have about the opcode
25
# NOTE: this must be kept in sync with aco_op_info
26

27
import sys
28
from enum import Enum
29

30
class InstrClass(Enum):
   """Instruction classes that an opcode can be tagged with.

   A member of this enum is what opcode() receives as its ``cls`` argument;
   InstrClass.Other is the default used there.
   """
   Valu32 = 0
   ValuConvert32 = 1
   Valu64 = 2
   ValuQuarterRate32 = 3
   ValuFma = 4
   ValuTranscendental32 = 5
   ValuDouble = 6
   ValuDoubleAdd = 7
   ValuDoubleConvert = 8
   ValuDoubleTranscendental = 9
   Salu = 10
   SMem = 11
   Barrier = 12
   Branch = 13
   Sendmsg = 14
   DS = 15
   Export = 16
   VMem = 17
   Waitcnt = 18
   Other = 19
51

52
class Format(Enum):
   """Instruction formats known to the opcode tables.

   Members up to VOP3P use small integers (7 is not assigned); VOP1 through
   SDWA each use a distinct single bit starting at bit 8.
   """
   PSEUDO = 0
   SOP1 = 1
   SOP2 = 2
   SOPK = 3
   SOPP = 4
   SOPC = 5
   SMEM = 6
   DS = 8
   MTBUF = 9
   MUBUF = 10
   MIMG = 11
   EXP = 12
   FLAT = 13
   GLOBAL = 14
   SCRATCH = 15
   PSEUDO_BRANCH = 16
   PSEUDO_BARRIER = 17
   PSEUDO_REDUCTION = 18
   VOP3P = 19
   VOP1 = 1 << 8
   VOP2 = 1 << 9
   VOPC = 1 << 10
   VOP3 = 1 << 11
   VINTRP = 1 << 12
   DPP = 1 << 13
   SDWA = 1 << 14

   def get_builder_fields(self):
      """Return the extra builder parameters of this format.

      Each entry is a tuple ``(c_type, name, default)`` or
      ``(c_type, name, default, dest)``:

      - c_type and name are emitted verbatim by get_builder_field_decls()
      - default is the default value to emit, or None when the parameter
        has no default
      - dest, when present, is what get_builder_field_dests() reports
        instead of name

      Formats without extra parameters return an empty list.
      """
      if self == Format.SOPK:
         return [('uint16_t', 'imm', None)]
      elif self == Format.SOPP:
         return [('uint32_t', 'block', '-1'),
                 ('uint32_t', 'imm', '0')]
      elif self == Format.SMEM:
         return [('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'nv', 'false')]
      elif self == Format.DS:
         return [('int16_t', 'offset0', '0'),
                 ('int8_t', 'offset1', '0'),
                 ('bool', 'gds', 'false')]
      elif self == Format.MTBUF:
         return [('unsigned', 'dfmt', None),
                 ('unsigned', 'nfmt', None),
                 ('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false')]
      elif self == Format.MUBUF:
         return [('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'swizzled', 'false'),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'addr64', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lds', 'false')]
      elif self == Format.MIMG:
         # An unreachable second return (a copy of the VINTRP field list)
         # used to follow this one; it has been dropped.
         return [('unsigned', 'dmask', '0xF'),
                 ('bool', 'da', 'false'),
                 ('bool', 'unrm', 'true'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lwe', 'false'),
                 ('bool', 'r128_a16', 'false', 'r128'),
                 ('bool', 'd16', 'false')]
      elif self == Format.EXP:
         return [('unsigned', 'enabled_mask', None),
                 ('unsigned', 'dest', None),
                 ('bool', 'compr', 'false', 'compressed'),
                 ('bool', 'done', 'false'),
                 ('bool', 'vm', 'false', 'valid_mask')]
      elif self == Format.PSEUDO_BRANCH:
         return [('uint32_t', 'target0', '0', 'target[0]'),
                 ('uint32_t', 'target1', '0', 'target[1]')]
      elif self == Format.PSEUDO_REDUCTION:
         return [('ReduceOp', 'op', None, 'reduce_op'),
                 ('unsigned', 'cluster_size', '0')]
      elif self == Format.PSEUDO_BARRIER:
         return [('memory_sync_info', 'sync', None),
                 ('sync_scope', 'exec_scope', 'scope_invocation')]
      elif self == Format.VINTRP:
         return [('unsigned', 'attribute', None),
                 ('unsigned', 'component', None)]
      elif self == Format.DPP:
         return [('uint16_t', 'dpp_ctrl', None),
                 ('uint8_t', 'row_mask', '0xF'),
                 ('uint8_t', 'bank_mask', '0xF'),
                 ('bool', 'bound_ctrl', 'true')]
      elif self == Format.VOP3P:
         return [('uint8_t', 'opsel_lo', None),
                 ('uint8_t', 'opsel_hi', None)]
      elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
         # The offset default is the integer 0 rather than a string; it is
         # kept as-is and formats to the same text as '0' would.
         return [('uint16_t', 'offset', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'lds', 'false'),
                 ('bool', 'nv', 'false')]
      else:
         return []

   def get_builder_field_names(self):
      """Return the parameter name of every builder field, in order."""
      return [f[1] for f in self.get_builder_fields()]

   def get_builder_field_dests(self):
      """Return, per builder field, its explicit dest if it has one, else its name."""
      return [(f[3] if len(f) >= 4 else f[1]) for f in self.get_builder_fields()]

   def get_builder_field_decls(self):
      """Return one 'type name' or 'type name=default' string per builder field."""
      decls = []
      for f in self.get_builder_fields():
         if f[2] is not None:
            decls.append('%s %s=%s' % (f[0], f[1], f[2]))
         else:
            decls.append('%s %s' % (f[0], f[1]))
      return decls

   def get_builder_initialization(self, num_operands):
      """Return extra initialization code text for this format.

      Only Format.SDWA produces any text: one ``instr->sel[i]`` assignment
      for each of the first (at most two) operands, followed by the
      ``dst_sel`` and ``dst_preserve`` statements. Every other format
      returns an empty string.
      """
      res = ''
      if self == Format.SDWA:
         for i in range(min(num_operands, 2)):
            res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i)
         res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
         res += 'if (def0.bytes() < 4) instr->dst_preserve = true;'
      return res
185

186

187
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with aco_op_info
   """
   def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic, cls):
      """Parameters:

      - name is the name of the opcode; operand_size and definition_size
        are derived from the type suffixes in it (see below)
      - opcode_gfx7, opcode_gfx9 and opcode_gfx10 are integer opcode
        numbers, one per hardware generation (the tables in this file use
        -1 where they have no number for a generation)
      - format is a Format member
      - input_mod and output_mod are bools; they are stored as the strings
        "1" or "0"
      - is_atomic is stored as "1" when truthy and "0" otherwise
      - cls is stored unchanged
      """
      assert isinstance(name, str)
      assert isinstance(opcode_gfx7, int)
      assert isinstance(opcode_gfx9, int)
      assert isinstance(opcode_gfx10, int)
      assert isinstance(format, Format)
      assert isinstance(input_mod, bool)
      assert isinstance(output_mod, bool)

      self.name = name
      self.opcode_gfx7 = opcode_gfx7
      self.opcode_gfx9 = opcode_gfx9
      self.opcode_gfx10 = opcode_gfx10
      self.input_mod = "1" if input_mod else "0"
      self.output_mod = "1" if output_mod else "0"
      self.is_atomic = "1" if is_atomic else "0"
      self.format = format
      self.cls = cls

      # After dropping any "_e64", the last "_"-separated token is taken as
      # the operand type and the one before it as the definition type
      # (e.g. "v_mul_i32_i24" -> operand "i24", definition "i32").
      parts = name.replace('_e64', '').rsplit('_', 2)
      op_dtype = parts[-1]
      def_dtype = parts[-2] if len(parts) > 1 else parts[-1]

      def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
      op_dtype_sizes = dict(def_dtype_sizes)
      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
      op_dtype_sizes['b16'] = 32
      op_dtype_sizes['i16'] = 32
      op_dtype_sizes['u16'] = 32

      # If we can't tell the definition size and the operand size, default to
      # 32. Some opcodes can have a larger definition size, but
      # get_subdword_definition_info() handles that.
      self.operand_size = op_dtype_sizes.get(op_dtype, 32)
      self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)

      # exceptions for operands:
      if 'qsad_' in name:
         self.operand_size = 0
      elif 'sad_' in name:
         self.operand_size = 32
      elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
         self.operand_size = 0
      elif self.operand_size == 24:
         self.operand_size = 32
      elif op_dtype == 'u8' or op_dtype == 'i8':
         self.operand_size = 32
      elif name in ['v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
                    'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']:
         self.operand_size = 32

      # exceptions for definitions:
      if 'qsad_' in name:
         self.definition_size = 0
      elif 'sad_' in name:
         self.definition_size = 32
      elif '_pk' in name:
         self.definition_size = 32
259

260

261
# global dictionary of opcodes
262
opcodes = {}

def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, cls = InstrClass.Other, input_mod = False, output_mod = False, is_atomic = False):
   """Create an Opcode from the arguments and register it in ``opcodes``.

   The new entry is stored under ``name``; registering the same name twice
   trips the assertion below.
   """
   assert name not in opcodes, "duplicate opcode name: %s" % name
   opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic, cls)
267

268
def default_class(opcodes, cls):
   """Yield every tuple of ``opcodes`` with an InstrClass as its last element.

   Tuples that already end in an InstrClass are yielded unchanged; all
   others get ``cls`` appended. Note that the ``opcodes`` parameter is the
   table being iterated and shadows the module-level dictionary of the
   same name inside this function.
   """
   for op in opcodes:
      if isinstance(op[-1], InstrClass):
         yield op
      else:
         yield op + (cls,)
274

275
opcode("exp", 0, 0, 0, format = Format.EXP, cls = InstrClass.Export)
opcode("p_parallelcopy")
opcode("p_startpgm")
opcode("p_phi")
opcode("p_linear_phi")
opcode("p_as_uniform")
opcode("p_unit_test")

opcode("p_create_vector")
opcode("p_extract_vector")
opcode("p_split_vector")

# start/end the parts where we can use exec based instructions
# implicitly
opcode("p_logical_start")
opcode("p_logical_end")

# e.g. subgroupMin() in SPIR-V
opcode("p_reduce", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupInclusiveMin()
opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupExclusiveMin()
opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION)

opcode("p_branch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)

opcode("p_barrier", format=Format.PSEUDO_BARRIER)

opcode("p_spill")
opcode("p_reload")

# start/end linear vgprs
opcode("p_start_linear_vgpr")
opcode("p_end_linear_vgpr")

opcode("p_wqm")
opcode("p_discard_if")
opcode("p_demote_to_helper")
opcode("p_is_helper")
opcode("p_exit_early_if")

# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute")

opcode("p_constaddr")

# These don't have to be pseudo-ops, but it makes optimization easier to only
# have to consider two instructions.
# (src0 >> (index * bits)) & ((1 << bits) - 1) with optional sign extension
opcode("p_extract") # src1=index, src2=bits, src3=signext
# (src0 & ((1 << bits) - 1)) << (index * bits)
opcode("p_insert") # src1=index, src2=bits
330

331

332
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
# Entries may carry a trailing InstrClass; default_class() supplies
# InstrClass.Salu for those that don't.
SOP2 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_add_u32"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_sub_u32"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_add_i32"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_sub_i32"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_addc_u32"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_subb_u32"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_min_i32"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_min_u32"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_max_i32"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_max_u32"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cselect_b32"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cselect_b64"),
   (0x0e, 0x0e, 0x0c, 0x0c, 0x0e, "s_and_b32"),
   (0x0f, 0x0f, 0x0d, 0x0d, 0x0f, "s_and_b64"),
   (0x10, 0x10, 0x0e, 0x0e, 0x10, "s_or_b32"),
   (0x11, 0x11, 0x0f, 0x0f, 0x11, "s_or_b64"),
   (0x12, 0x12, 0x10, 0x10, 0x12, "s_xor_b32"),
   (0x13, 0x13, 0x11, 0x11, 0x13, "s_xor_b64"),
   (0x14, 0x14, 0x12, 0x12, 0x14, "s_andn2_b32"),
   (0x15, 0x15, 0x13, 0x13, 0x15, "s_andn2_b64"),
   (0x16, 0x16, 0x14, 0x14, 0x16, "s_orn2_b32"),
   (0x17, 0x17, 0x15, 0x15, 0x17, "s_orn2_b64"),
   (0x18, 0x18, 0x16, 0x16, 0x18, "s_nand_b32"),
   (0x19, 0x19, 0x17, 0x17, 0x19, "s_nand_b64"),
   (0x1a, 0x1a, 0x18, 0x18, 0x1a, "s_nor_b32"),
   (0x1b, 0x1b, 0x19, 0x19, 0x1b, "s_nor_b64"),
   (0x1c, 0x1c, 0x1a, 0x1a, 0x1c, "s_xnor_b32"),
   (0x1d, 0x1d, 0x1b, 0x1b, 0x1d, "s_xnor_b64"),
   (0x1e, 0x1e, 0x1c, 0x1c, 0x1e, "s_lshl_b32"),
   (0x1f, 0x1f, 0x1d, 0x1d, 0x1f, "s_lshl_b64"),
   (0x20, 0x20, 0x1e, 0x1e, 0x20, "s_lshr_b32"),
   (0x21, 0x21, 0x1f, 0x1f, 0x21, "s_lshr_b64"),
   (0x22, 0x22, 0x20, 0x20, 0x22, "s_ashr_i32"),
   (0x23, 0x23, 0x21, 0x21, 0x23, "s_ashr_i64"),
   (0x24, 0x24, 0x22, 0x22, 0x24, "s_bfm_b32"),
   (0x25, 0x25, 0x23, 0x23, 0x25, "s_bfm_b64"),
   (0x26, 0x26, 0x24, 0x24, 0x26, "s_mul_i32"),
   (0x27, 0x27, 0x25, 0x25, 0x27, "s_bfe_u32"),
   (0x28, 0x28, 0x26, 0x26, 0x28, "s_bfe_i32"),
   (0x29, 0x29, 0x27, 0x27, 0x29, "s_bfe_u64"),
   (0x2a, 0x2a, 0x28, 0x28, 0x2a, "s_bfe_i64"),
   (0x2b, 0x2b, 0x29, 0x29,   -1, "s_cbranch_g_fork", InstrClass.Branch),
   (0x2c, 0x2c, 0x2a, 0x2a, 0x2c, "s_absdiff_i32"),
   (  -1,   -1, 0x2b, 0x2b,   -1, "s_rfe_restore_b64", InstrClass.Branch),
   (  -1,   -1,   -1, 0x2e, 0x2e, "s_lshl1_add_u32"),
   (  -1,   -1,   -1, 0x2f, 0x2f, "s_lshl2_add_u32"),
   (  -1,   -1,   -1, 0x30, 0x30, "s_lshl3_add_u32"),
   (  -1,   -1,   -1, 0x31, 0x31, "s_lshl4_add_u32"),
   (  -1,   -1,   -1, 0x32, 0x32, "s_pack_ll_b32_b16"),
   (  -1,   -1,   -1, 0x33, 0x33, "s_pack_lh_b32_b16"),
   (  -1,   -1,   -1, 0x34, 0x34, "s_pack_hh_b32_b16"),
   (  -1,   -1,   -1, 0x2c, 0x35, "s_mul_hi_u32"),
   (  -1,   -1,   -1, 0x2d, 0x36, "s_mul_hi_i32"),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2.
   (  -1,   -1,   -1,   -1,   -1, "p_constaddr_addlo"),
}
# Only the GFX7, GFX9 and GFX10 columns are passed on to opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, cls) in default_class(SOP2, InstrClass.Salu):
   opcode(name, gfx7, gfx9, gfx10, Format.SOP2, cls)
393

394

395
# SOPK instructions: 0 input (+ imm), 1 output + optional scc
# Entries may carry a trailing InstrClass; default_class() supplies
# InstrClass.Salu for those that don't.
SOPK = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_movk_i32"),
   (  -1,   -1,   -1,   -1, 0x01, "s_version"), # GFX10+
   (0x02, 0x02, 0x01, 0x01, 0x02, "s_cmovk_i32"), # GFX8_GFX9
   (0x03, 0x03, 0x02, 0x02, 0x03, "s_cmpk_eq_i32"),
   (0x04, 0x04, 0x03, 0x03, 0x04, "s_cmpk_lg_i32"),
   (0x05, 0x05, 0x04, 0x04, 0x05, "s_cmpk_gt_i32"),
   (0x06, 0x06, 0x05, 0x05, 0x06, "s_cmpk_ge_i32"),
   (0x07, 0x07, 0x06, 0x06, 0x07, "s_cmpk_lt_i32"),
   (0x08, 0x08, 0x07, 0x07, 0x08, "s_cmpk_le_i32"),
   (0x09, 0x09, 0x08, 0x08, 0x09, "s_cmpk_eq_u32"),
   (0x0a, 0x0a, 0x09, 0x09, 0x0a, "s_cmpk_lg_u32"),
   (0x0b, 0x0b, 0x0a, 0x0a, 0x0b, "s_cmpk_gt_u32"),
   (0x0c, 0x0c, 0x0b, 0x0b, 0x0c, "s_cmpk_ge_u32"),
   (0x0d, 0x0d, 0x0c, 0x0c, 0x0d, "s_cmpk_lt_u32"),
   (0x0e, 0x0e, 0x0d, 0x0d, 0x0e, "s_cmpk_le_u32"),
   (0x0f, 0x0f, 0x0e, 0x0e, 0x0f, "s_addk_i32"),
   (0x10, 0x10, 0x0f, 0x0f, 0x10, "s_mulk_i32"),
   (0x11, 0x11, 0x10, 0x10,   -1, "s_cbranch_i_fork", InstrClass.Branch),
   (0x12, 0x12, 0x11, 0x11, 0x12, "s_getreg_b32"),
   (0x13, 0x13, 0x12, 0x12, 0x13, "s_setreg_b32"),
   (0x15, 0x15, 0x14, 0x14, 0x15, "s_setreg_imm32_b32"), # requires 32bit literal
   (  -1,   -1, 0x15, 0x15, 0x16, "s_call_b64", InstrClass.Branch),
   (  -1,   -1,   -1,   -1, 0x17, "s_waitcnt_vscnt", InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x18, "s_waitcnt_vmcnt", InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x19, "s_waitcnt_expcnt", InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x1a, "s_waitcnt_lgkmcnt", InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x1b, "s_subvector_loop_begin", InstrClass.Branch),
   (  -1,   -1,   -1,   -1, 0x1c, "s_subvector_loop_end", InstrClass.Branch),
}
# Only the GFX7, GFX9 and GFX10 columns are passed on to opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, cls) in default_class(SOPK, InstrClass.Salu):
   opcode(name, gfx7, gfx9, gfx10, Format.SOPK, cls)
429

430

431
# SOP1 instructions: 1 input, 1 output (+optional SCC)
# Entries may carry a trailing InstrClass; default_class() supplies
# InstrClass.Salu for those that don't.
SOP1 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x03, 0x03, 0x00, 0x00, 0x03, "s_mov_b32"),
   (0x04, 0x04, 0x01, 0x01, 0x04, "s_mov_b64"),
   (0x05, 0x05, 0x02, 0x02, 0x05, "s_cmov_b32"),
   (0x06, 0x06, 0x03, 0x03, 0x06, "s_cmov_b64"),
   (0x07, 0x07, 0x04, 0x04, 0x07, "s_not_b32"),
   (0x08, 0x08, 0x05, 0x05, 0x08, "s_not_b64"),
   (0x09, 0x09, 0x06, 0x06, 0x09, "s_wqm_b32"),
   (0x0a, 0x0a, 0x07, 0x07, 0x0a, "s_wqm_b64"),
   (0x0b, 0x0b, 0x08, 0x08, 0x0b, "s_brev_b32"),
   (0x0c, 0x0c, 0x09, 0x09, 0x0c, "s_brev_b64"),
   (0x0d, 0x0d, 0x0a, 0x0a, 0x0d, "s_bcnt0_i32_b32"),
   (0x0e, 0x0e, 0x0b, 0x0b, 0x0e, "s_bcnt0_i32_b64"),
   (0x0f, 0x0f, 0x0c, 0x0c, 0x0f, "s_bcnt1_i32_b32"),
   (0x10, 0x10, 0x0d, 0x0d, 0x10, "s_bcnt1_i32_b64"),
   (0x11, 0x11, 0x0e, 0x0e, 0x11, "s_ff0_i32_b32"),
   (0x12, 0x12, 0x0f, 0x0f, 0x12, "s_ff0_i32_b64"),
   (0x13, 0x13, 0x10, 0x10, 0x13, "s_ff1_i32_b32"),
   (0x14, 0x14, 0x11, 0x11, 0x14, "s_ff1_i32_b64"),
   (0x15, 0x15, 0x12, 0x12, 0x15, "s_flbit_i32_b32"),
   (0x16, 0x16, 0x13, 0x13, 0x16, "s_flbit_i32_b64"),
   (0x17, 0x17, 0x14, 0x14, 0x17, "s_flbit_i32"),
   (0x18, 0x18, 0x15, 0x15, 0x18, "s_flbit_i32_i64"),
   (0x19, 0x19, 0x16, 0x16, 0x19, "s_sext_i32_i8"),
   (0x1a, 0x1a, 0x17, 0x17, 0x1a, "s_sext_i32_i16"),
   (0x1b, 0x1b, 0x18, 0x18, 0x1b, "s_bitset0_b32"),
   (0x1c, 0x1c, 0x19, 0x19, 0x1c, "s_bitset0_b64"),
   (0x1d, 0x1d, 0x1a, 0x1a, 0x1d, "s_bitset1_b32"),
   (0x1e, 0x1e, 0x1b, 0x1b, 0x1e, "s_bitset1_b64"),
   (0x1f, 0x1f, 0x1c, 0x1c, 0x1f, "s_getpc_b64"),
   (0x20, 0x20, 0x1d, 0x1d, 0x20, "s_setpc_b64", InstrClass.Branch),
   (0x21, 0x21, 0x1e, 0x1e, 0x21, "s_swappc_b64", InstrClass.Branch),
   (0x22, 0x22, 0x1f, 0x1f, 0x22, "s_rfe_b64", InstrClass.Branch),
   (0x24, 0x24, 0x20, 0x20, 0x24, "s_and_saveexec_b64"),
   (0x25, 0x25, 0x21, 0x21, 0x25, "s_or_saveexec_b64"),
   (0x26, 0x26, 0x22, 0x22, 0x26, "s_xor_saveexec_b64"),
   (0x27, 0x27, 0x23, 0x23, 0x27, "s_andn2_saveexec_b64"),
   (0x28, 0x28, 0x24, 0x24, 0x28, "s_orn2_saveexec_b64"),
   (0x29, 0x29, 0x25, 0x25, 0x29, "s_nand_saveexec_b64"),
   (0x2a, 0x2a, 0x26, 0x26, 0x2a, "s_nor_saveexec_b64"),
   (0x2b, 0x2b, 0x27, 0x27, 0x2b, "s_xnor_saveexec_b64"),
   (0x2c, 0x2c, 0x28, 0x28, 0x2c, "s_quadmask_b32"),
   (0x2d, 0x2d, 0x29, 0x29, 0x2d, "s_quadmask_b64"),
   (0x2e, 0x2e, 0x2a, 0x2a, 0x2e, "s_movrels_b32"),
   (0x2f, 0x2f, 0x2b, 0x2b, 0x2f, "s_movrels_b64"),
   (0x30, 0x30, 0x2c, 0x2c, 0x30, "s_movreld_b32"),
   (0x31, 0x31, 0x2d, 0x2d, 0x31, "s_movreld_b64"),
   (0x32, 0x32, 0x2e, 0x2e,   -1, "s_cbranch_join", InstrClass.Branch),
   (0x34, 0x34, 0x30, 0x30, 0x34, "s_abs_i32"),
   (0x35, 0x35,   -1,   -1, 0x35, "s_mov_fed_b32"),
   (  -1,   -1, 0x32, 0x32,   -1, "s_set_gpr_idx_idx"),
   (  -1,   -1,   -1, 0x33, 0x37, "s_andn1_saveexec_b64"),
   (  -1,   -1,   -1, 0x34, 0x38, "s_orn1_saveexec_b64"),
   (  -1,   -1,   -1, 0x35, 0x39, "s_andn1_wrexec_b64"),
   (  -1,   -1,   -1, 0x36, 0x3a, "s_andn2_wrexec_b64"),
   (  -1,   -1,   -1, 0x37, 0x3b, "s_bitreplicate_b64_b32"),
   (  -1,   -1,   -1,   -1, 0x3c, "s_and_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3d, "s_or_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3e, "s_xor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3f, "s_andn2_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x40, "s_orn2_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x41, "s_nand_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x42, "s_nor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x43, "s_xnor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x44, "s_andn1_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x45, "s_orn1_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x46, "s_andn1_wrexec_b32"),
   (  -1,   -1,   -1,   -1, 0x47, "s_andn2_wrexec_b32"),
   (  -1,   -1,   -1,   -1, 0x49, "s_movrelsd_2_b32"),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
   (  -1,   -1,   -1,   -1,   -1, "p_constaddr_getpc"),
}
# Only the GFX7, GFX9 and GFX10 columns are passed on to opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, cls) in default_class(SOP1, InstrClass.Salu):
   opcode(name, gfx7, gfx9, gfx10, Format.SOP1, cls)
507

508

509
# SOPC instructions: 2 inputs and 0 outputs (+SCC)
SOPC = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_cmp_eq_i32"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_cmp_lg_i32"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_cmp_gt_i32"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_cmp_ge_i32"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_cmp_lt_i32"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_cmp_le_i32"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_cmp_eq_u32"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_cmp_lg_u32"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_cmp_gt_u32"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_cmp_ge_u32"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cmp_lt_u32"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cmp_le_u32"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_bitcmp0_b32"),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_bitcmp1_b32"),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_bitcmp0_b64"),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_bitcmp1_b64"),
   (0x10, 0x10, 0x10, 0x10,   -1, "s_setvskip"),
   (  -1,   -1, 0x11, 0x11,   -1, "s_set_gpr_idx_on"),
   (  -1,   -1, 0x12, 0x12, 0x12, "s_cmp_eq_u64"),
   (  -1,   -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"),
}
# Every SOPC entry is registered as InstrClass.Salu; only the GFX7, GFX9 and
# GFX10 columns are passed on to opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC:
   opcode(name, gfx7, gfx9, gfx10, Format.SOPC, InstrClass.Salu)
535

536

537
# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
# Entries may carry a trailing InstrClass; default_class() supplies
# InstrClass.Salu for those that don't.
SOPP = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_nop"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_endpgm"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_branch", InstrClass.Branch),
   (  -1,   -1, 0x03, 0x03, 0x03, "s_wakeup"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_cbranch_scc0", InstrClass.Branch),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_cbranch_scc1", InstrClass.Branch),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_cbranch_vccz", InstrClass.Branch),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_cbranch_vccnz", InstrClass.Branch),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_cbranch_execz", InstrClass.Branch),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_cbranch_execnz", InstrClass.Branch),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_barrier", InstrClass.Barrier),
   (  -1, 0x0b, 0x0b, 0x0b, 0x0b, "s_setkill"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_waitcnt", InstrClass.Waitcnt),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_sethalt"),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_sleep"),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_setprio"),
   (0x10, 0x10, 0x10, 0x10, 0x10, "s_sendmsg", InstrClass.Sendmsg),
   (0x11, 0x11, 0x11, 0x11, 0x11, "s_sendmsghalt", InstrClass.Sendmsg),
   (0x12, 0x12, 0x12, 0x12, 0x12, "s_trap", InstrClass.Branch),
   (0x13, 0x13, 0x13, 0x13, 0x13, "s_icache_inv"),
   (0x14, 0x14, 0x14, 0x14, 0x14, "s_incperflevel"),
   (0x15, 0x15, 0x15, 0x15, 0x15, "s_decperflevel"),
   (0x16, 0x16, 0x16, 0x16, 0x16, "s_ttracedata"),
   (  -1, 0x17, 0x17, 0x17, 0x17, "s_cbranch_cdbgsys", InstrClass.Branch),
   (  -1, 0x18, 0x18, 0x18, 0x18, "s_cbranch_cdbguser", InstrClass.Branch),
   (  -1, 0x19, 0x19, 0x19, 0x19, "s_cbranch_cdbgsys_or_user", InstrClass.Branch),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, "s_cbranch_cdbgsys_and_user", InstrClass.Branch),
   (  -1,   -1, 0x1b, 0x1b, 0x1b, "s_endpgm_saved"),
   (  -1,   -1, 0x1c, 0x1c,   -1, "s_set_gpr_idx_off"),
   (  -1,   -1, 0x1d, 0x1d,   -1, "s_set_gpr_idx_mode"),
   (  -1,   -1,   -1, 0x1e, 0x1e, "s_endpgm_ordered_ps_done"),
   (  -1,   -1,   -1,   -1, 0x1f, "s_code_end"),
   (  -1,   -1,   -1,   -1, 0x20, "s_inst_prefetch"),
   (  -1,   -1,   -1,   -1, 0x21, "s_clause"),
   (  -1,   -1,   -1,   -1, 0x22, "s_wait_idle"),
   (  -1,   -1,   -1,   -1, 0x23, "s_waitcnt_depctr"),
   (  -1,   -1,   -1,   -1, 0x24, "s_round_mode"),
   (  -1,   -1,   -1,   -1, 0x25, "s_denorm_mode"),
   (  -1,   -1,   -1,   -1, 0x26, "s_ttracedata_imm"),
}
# Only the GFX7, GFX9 and GFX10 columns are passed on to opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, cls) in default_class(SOPP, InstrClass.Salu):
   opcode(name, gfx7, gfx9, gfx10, Format.SOPP, cls)
582

583

584
# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
585
# Unlike GFX10, GFX10.3 does not have SMEM store, atomic or scratch instructions
586
SMEM = {
587
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
588
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_load_dword"),
589
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_load_dwordx2"),
590
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_load_dwordx4"),
591
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_load_dwordx8"),
592
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_load_dwordx16"),
593
   (  -1,   -1,   -1, 0x05, 0x05, "s_scratch_load_dword"),
594
   (  -1,   -1,   -1, 0x06, 0x06, "s_scratch_load_dwordx2"),
595
   (  -1,   -1,   -1, 0x07, 0x07, "s_scratch_load_dwordx4"),
596
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_buffer_load_dword"),
597
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_buffer_load_dwordx2"),
598
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_buffer_load_dwordx4"),
599
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_buffer_load_dwordx8"),
600
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_buffer_load_dwordx16"),
601
   (  -1,   -1, 0x10, 0x10, 0x10, "s_store_dword"),
602
   (  -1,   -1, 0x11, 0x11, 0x11, "s_store_dwordx2"),
603
   (  -1,   -1, 0x12, 0x12, 0x12, "s_store_dwordx4"),
604
   (  -1,   -1,   -1, 0x15, 0x15, "s_scratch_store_dword"),
605
   (  -1,   -1,   -1, 0x16, 0x16, "s_scratch_store_dwordx2"),
606
   (  -1,   -1,   -1, 0x17, 0x17, "s_scratch_store_dwordx4"),
607
   (  -1,   -1, 0x18, 0x18, 0x18, "s_buffer_store_dword"),
608
   (  -1,   -1, 0x19, 0x19, 0x19, "s_buffer_store_dwordx2"),
609
   (  -1,   -1, 0x1a, 0x1a, 0x1a, "s_buffer_store_dwordx4"),
610
   (  -1,   -1, 0x1f, 0x1f, 0x1f, "s_gl1_inv"),
611
   (0x1f, 0x1f, 0x20, 0x20, 0x20, "s_dcache_inv"),
612
   (  -1,   -1, 0x21, 0x21, 0x21, "s_dcache_wb"),
613
   (  -1, 0x1d, 0x22, 0x22,   -1, "s_dcache_inv_vol"),
614
   (  -1,   -1, 0x23, 0x23,   -1, "s_dcache_wb_vol"),
615
   (0x1e, 0x1e, 0x24, 0x24, 0x24, "s_memtime"), #GFX6-GFX10
616
   (  -1,   -1, 0x25, 0x25, 0x25, "s_memrealtime"),
617
   (  -1,   -1, 0x26, 0x26, 0x26, "s_atc_probe"),
618
   (  -1,   -1, 0x27, 0x27, 0x27, "s_atc_probe_buffer"),
619
   (  -1,   -1,   -1, 0x28, 0x28, "s_dcache_discard"),
620
   (  -1,   -1,   -1, 0x29, 0x29, "s_dcache_discard_x2"),
621
   (  -1,   -1,   -1,   -1, 0x2a, "s_get_waveid_in_workgroup"),
622
   (  -1,   -1,   -1, 0x40, 0x40, "s_buffer_atomic_swap"),
623
   (  -1,   -1,   -1, 0x41, 0x41, "s_buffer_atomic_cmpswap"),
624
   (  -1,   -1,   -1, 0x42, 0x42, "s_buffer_atomic_add"),
625
   (  -1,   -1,   -1, 0x43, 0x43, "s_buffer_atomic_sub"),
626
   (  -1,   -1,   -1, 0x44, 0x44, "s_buffer_atomic_smin"),
627
   (  -1,   -1,   -1, 0x45, 0x45, "s_buffer_atomic_umin"),
628
   (  -1,   -1,   -1, 0x46, 0x46, "s_buffer_atomic_smax"),
629
   (  -1,   -1,   -1, 0x47, 0x47, "s_buffer_atomic_umax"),
630
   (  -1,   -1,   -1, 0x48, 0x48, "s_buffer_atomic_and"),
631
   (  -1,   -1,   -1, 0x49, 0x49, "s_buffer_atomic_or"),
632
   (  -1,   -1,   -1, 0x4a, 0x4a, "s_buffer_atomic_xor"),
633
   (  -1,   -1,   -1, 0x4b, 0x4b, "s_buffer_atomic_inc"),
634
   (  -1,   -1,   -1, 0x4c, 0x4c, "s_buffer_atomic_dec"),
635
   (  -1,   -1,   -1, 0x60, 0x60, "s_buffer_atomic_swap_x2"),
636
   (  -1,   -1,   -1, 0x61, 0x61, "s_buffer_atomic_cmpswap_x2"),
637
   (  -1,   -1,   -1, 0x62, 0x62, "s_buffer_atomic_add_x2"),
638
   (  -1,   -1,   -1, 0x63, 0x63, "s_buffer_atomic_sub_x2"),
639
   (  -1,   -1,   -1, 0x64, 0x64, "s_buffer_atomic_smin_x2"),
640
   (  -1,   -1,   -1, 0x65, 0x65, "s_buffer_atomic_umin_x2"),
641
   (  -1,   -1,   -1, 0x66, 0x66, "s_buffer_atomic_smax_x2"),
642
   (  -1,   -1,   -1, 0x67, 0x67, "s_buffer_atomic_umax_x2"),
643
   (  -1,   -1,   -1, 0x68, 0x68, "s_buffer_atomic_and_x2"),
644
   (  -1,   -1,   -1, 0x69, 0x69, "s_buffer_atomic_or_x2"),
645
   (  -1,   -1,   -1, 0x6a, 0x6a, "s_buffer_atomic_xor_x2"),
646
   (  -1,   -1,   -1, 0x6b, 0x6b, "s_buffer_atomic_inc_x2"),
647
   (  -1,   -1,   -1, 0x6c, 0x6c, "s_buffer_atomic_dec_x2"),
648
   (  -1,   -1,   -1, 0x80, 0x80, "s_atomic_swap"),
649
   (  -1,   -1,   -1, 0x81, 0x81, "s_atomic_cmpswap"),
650
   (  -1,   -1,   -1, 0x82, 0x82, "s_atomic_add"),
651
   (  -1,   -1,   -1, 0x83, 0x83, "s_atomic_sub"),
652
   (  -1,   -1,   -1, 0x84, 0x84, "s_atomic_smin"),
653
   (  -1,   -1,   -1, 0x85, 0x85, "s_atomic_umin"),
654
   (  -1,   -1,   -1, 0x86, 0x86, "s_atomic_smax"),
655
   (  -1,   -1,   -1, 0x87, 0x87, "s_atomic_umax"),
656
   (  -1,   -1,   -1, 0x88, 0x88, "s_atomic_and"),
657
   (  -1,   -1,   -1, 0x89, 0x89, "s_atomic_or"),
658
   (  -1,   -1,   -1, 0x8a, 0x8a, "s_atomic_xor"),
659
   (  -1,   -1,   -1, 0x8b, 0x8b, "s_atomic_inc"),
660
   (  -1,   -1,   -1, 0x8c, 0x8c, "s_atomic_dec"),
661
   (  -1,   -1,   -1, 0xa0, 0xa0, "s_atomic_swap_x2"),
662
   (  -1,   -1,   -1, 0xa1, 0xa1, "s_atomic_cmpswap_x2"),
663
   (  -1,   -1,   -1, 0xa2, 0xa2, "s_atomic_add_x2"),
664
   (  -1,   -1,   -1, 0xa3, 0xa3, "s_atomic_sub_x2"),
665
   (  -1,   -1,   -1, 0xa4, 0xa4, "s_atomic_smin_x2"),
666
   (  -1,   -1,   -1, 0xa5, 0xa5, "s_atomic_umin_x2"),
667
   (  -1,   -1,   -1, 0xa6, 0xa6, "s_atomic_smax_x2"),
668
   (  -1,   -1,   -1, 0xa7, 0xa7, "s_atomic_umax_x2"),
669
   (  -1,   -1,   -1, 0xa8, 0xa8, "s_atomic_and_x2"),
670
   (  -1,   -1,   -1, 0xa9, 0xa9, "s_atomic_or_x2"),
671
   (  -1,   -1,   -1, 0xaa, 0xaa, "s_atomic_xor_x2"),
672
   (  -1,   -1,   -1, 0xab, 0xab, "s_atomic_inc_x2"),
673
   (  -1,   -1,   -1, 0xac, 0xac, "s_atomic_dec_x2"),
674
}
675
# Register every entry of the SMEM table. Only the GFX7, GFX9 and GFX10
# encodings are forwarded to opcode(); an instruction is flagged as atomic
# when its name contains the substring "atomic".
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
   opcode(name, gfx7, gfx9, gfx10, Format.SMEM, InstrClass.SMem, is_atomic = "atomic" in name)

678

679
# VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
# TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8
#
# Each entry holds one encoding number per hardware generation, the
# instruction name, and a single flag used for both input and output modifiers.
# NOTE(review): -1 presumably marks a generation without an encoding for the
# instruction -- confirm against opcode().
VOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
   (0x01, 0x01,   -1,   -1,   -1, "v_readlane_b32", False),
   (0x02, 0x02,   -1,   -1,   -1, "v_writelane_b32", False),
   (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
   (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
   (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
   (0x06, 0x06,   -1,   -1, 0x06, "v_mac_legacy_f32", True),
   (0x07, 0x07, 0x04, 0x04, 0x07, "v_mul_legacy_f32", True),
   (0x08, 0x08, 0x05, 0x05, 0x08, "v_mul_f32", True),
   (0x09, 0x09, 0x06, 0x06, 0x09, "v_mul_i32_i24", False),
   (0x0a, 0x0a, 0x07, 0x07, 0x0a, "v_mul_hi_i32_i24", False),
   (0x0b, 0x0b, 0x08, 0x08, 0x0b, "v_mul_u32_u24", False),
   (0x0c, 0x0c, 0x09, 0x09, 0x0c, "v_mul_hi_u32_u24", False),
   (0x0d, 0x0d,   -1,   -1,   -1, "v_min_legacy_f32", True),
   (0x0e, 0x0e,   -1,   -1,   -1, "v_max_legacy_f32", True),
   (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, "v_min_f32", True),
   (0x10, 0x10, 0x0b, 0x0b, 0x10, "v_max_f32", True),
   (0x11, 0x11, 0x0c, 0x0c, 0x11, "v_min_i32", False),
   (0x12, 0x12, 0x0d, 0x0d, 0x12, "v_max_i32", False),
   (0x13, 0x13, 0x0e, 0x0e, 0x13, "v_min_u32", False),
   (0x14, 0x14, 0x0f, 0x0f, 0x14, "v_max_u32", False),
   (0x15, 0x15,   -1,   -1,   -1, "v_lshr_b32", False),
   (0x16, 0x16, 0x10, 0x10, 0x16, "v_lshrrev_b32", False),
   (0x17, 0x17,   -1,   -1,   -1, "v_ashr_i32", False),
   (0x18, 0x18, 0x11, 0x11, 0x18, "v_ashrrev_i32", False),
   (0x19, 0x19,   -1,   -1,   -1, "v_lshl_b32", False),
   (0x1a, 0x1a, 0x12, 0x12, 0x1a, "v_lshlrev_b32", False),
   (0x1b, 0x1b, 0x13, 0x13, 0x1b, "v_and_b32", False),
   (0x1c, 0x1c, 0x14, 0x14, 0x1c, "v_or_b32", False),
   (0x1d, 0x1d, 0x15, 0x15, 0x1d, "v_xor_b32", False),
   (  -1,   -1,   -1,   -1, 0x1e, "v_xnor_b32", False),
   (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True),
   (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False),
   (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False),
   (0x24, 0x24,   -1,   -1,   -1, "v_mbcnt_hi_u32_b32", False),
   (0x25, 0x25, 0x19, 0x19,   -1, "v_add_co_u32", False), # VOP3B only in RDNA
   (0x26, 0x26, 0x1a, 0x1a,   -1, "v_sub_co_u32", False), # VOP3B only in RDNA
   (0x27, 0x27, 0x1b, 0x1b,   -1, "v_subrev_co_u32", False), # VOP3B only in RDNA
   (0x28, 0x28, 0x1c, 0x1c, 0x28, "v_addc_co_u32", False), # v_add_co_ci_u32 in RDNA
   (0x29, 0x29, 0x1d, 0x1d, 0x29, "v_subb_co_u32", False), # v_sub_co_ci_u32 in RDNA
   (0x2a, 0x2a, 0x1e, 0x1e, 0x2a, "v_subbrev_co_u32", False), # v_subrev_co_ci_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x2b, "v_fmac_f32", True),
   (  -1,   -1,   -1,   -1, 0x2c, "v_fmamk_f32", True),
   (  -1,   -1,   -1,   -1, 0x2d, "v_fmaak_f32", True),
   (0x2f, 0x2f,   -1,   -1, 0x2f, "v_cvt_pkrtz_f16_f32", True),
   (  -1,   -1, 0x1f, 0x1f, 0x32, "v_add_f16", True),
   (  -1,   -1, 0x20, 0x20, 0x33, "v_sub_f16", True),
   (  -1,   -1, 0x21, 0x21, 0x34, "v_subrev_f16", True),
   (  -1,   -1, 0x22, 0x22, 0x35, "v_mul_f16", True),
   (  -1,   -1, 0x23, 0x23,   -1, "v_mac_f16", True),
   (  -1,   -1, 0x24, 0x24,   -1, "v_madmk_f16", False),
   (  -1,   -1, 0x25, 0x25,   -1, "v_madak_f16", False),
   (  -1,   -1, 0x26, 0x26,   -1, "v_add_u16", False),
   (  -1,   -1, 0x27, 0x27,   -1, "v_sub_u16", False),
   (  -1,   -1, 0x28, 0x28,   -1, "v_subrev_u16", False),
   (  -1,   -1, 0x29, 0x29,   -1, "v_mul_lo_u16", False),
   (  -1,   -1, 0x2a, 0x2a,   -1, "v_lshlrev_b16", False),
   (  -1,   -1, 0x2b, 0x2b,   -1, "v_lshrrev_b16", False),
   (  -1,   -1, 0x2c, 0x2c,   -1, "v_ashrrev_i16", False),
   (  -1,   -1, 0x2d, 0x2d, 0x39, "v_max_f16", True),
   (  -1,   -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True),
   (  -1,   -1, 0x2f, 0x2f,   -1, "v_max_u16", False),
   (  -1,   -1, 0x30, 0x30,   -1, "v_max_i16", False),
   (  -1,   -1, 0x31, 0x31,   -1, "v_min_u16", False),
   (  -1,   -1, 0x32, 0x32,   -1, "v_min_i16", False),
   (  -1,   -1, 0x33, 0x33, 0x3b, "v_ldexp_f16", False),
   (  -1,   -1,   -1, 0x34, 0x25, "v_add_u32", False), # use v_add_co_u32 on GFX8, called v_add_nc_u32 in RDNA
   (  -1,   -1,   -1, 0x35, 0x26, "v_sub_u32", False), # use v_sub_co_u32 on GFX8, called v_sub_nc_u32 in RDNA
   (  -1,   -1,   -1, 0x36, 0x27, "v_subrev_u32", False), # use v_subrev_co_u32 on GFX8, called v_subrev_nc_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x36, "v_fmac_f16", False),
   (  -1,   -1,   -1,   -1, 0x37, "v_fmamk_f16", False),
   (  -1,   -1,   -1,   -1, 0x38, "v_fmaak_f16", False),
   (  -1,   -1,   -1,   -1, 0x3c, "v_pk_fmac_f16", False),
}
# Register every VOP2 entry. The table carries one modifiers flag per
# instruction, which is forwarded for both trailing arguments of opcode().
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2:
   opcode(name, gfx7, gfx9, gfx10, Format.VOP2, InstrClass.Valu32, modifiers, modifiers)

759
# v_cndmask_b32 can use input modifiers but not output modifiers, so it cannot
# go through the VOP2 table (which has a single flag for both) and is
# registered on its own.
(gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32")
opcode(name, gfx7, gfx9, gfx10, Format.VOP2, InstrClass.Valu32, True, False)

764

765
# VOP1 instructions: instructions with 1 input and 1 output
#
# Entries may end with an explicit InstrClass; entries without one have seven
# or eight fields only and get their class when the table is registered.
# NOTE(review): -1 presumably marks a generation without an encoding for the
# instruction -- confirm against opcode().
VOP1 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers, output_modifiers
   (0x00, 0x00, 0x00, 0x00, 0x00, "v_nop", False, False),
   (0x01, 0x01, 0x01, 0x01, 0x01, "v_mov_b32", False, False),
   (0x02, 0x02, 0x02, 0x02, 0x02, "v_readfirstlane_b32", False, False),
   (0x03, 0x03, 0x03, 0x03, 0x03, "v_cvt_i32_f64", True, False, InstrClass.ValuDoubleConvert),
   (0x04, 0x04, 0x04, 0x04, 0x04, "v_cvt_f64_i32", False, True, InstrClass.ValuDoubleConvert),
   (0x05, 0x05, 0x05, 0x05, 0x05, "v_cvt_f32_i32", False, True),
   (0x06, 0x06, 0x06, 0x06, 0x06, "v_cvt_f32_u32", False, True),
   (0x07, 0x07, 0x07, 0x07, 0x07, "v_cvt_u32_f32", True, False),
   (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False),
   (0x09, 0x09,   -1,   -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True),
   (  -1,   -1,   -1,   -1,   -1, "p_cvt_f16_f32_rtne", True, True),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "v_cvt_off_f32_i4", False, True),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "v_cvt_f32_f64", True, True, InstrClass.ValuDoubleConvert),
   (0x10, 0x10, 0x10, 0x10, 0x10, "v_cvt_f64_f32", True, True, InstrClass.ValuDoubleConvert),
   (0x11, 0x11, 0x11, 0x11, 0x11, "v_cvt_f32_ubyte0", False, True),
   (0x12, 0x12, 0x12, 0x12, 0x12, "v_cvt_f32_ubyte1", False, True),
   (0x13, 0x13, 0x13, 0x13, 0x13, "v_cvt_f32_ubyte2", False, True),
   (0x14, 0x14, 0x14, 0x14, 0x14, "v_cvt_f32_ubyte3", False, True),
   (0x15, 0x15, 0x15, 0x15, 0x15, "v_cvt_u32_f64", True, False, InstrClass.ValuDoubleConvert),
   (0x16, 0x16, 0x16, 0x16, 0x16, "v_cvt_f64_u32", False, True, InstrClass.ValuDoubleConvert),
   (  -1, 0x17, 0x17, 0x17, 0x17, "v_trunc_f64", True, True, InstrClass.ValuDouble),
   (  -1, 0x18, 0x18, 0x18, 0x18, "v_ceil_f64", True, True, InstrClass.ValuDouble),
   (  -1, 0x19, 0x19, 0x19, 0x19, "v_rndne_f64", True, True, InstrClass.ValuDouble),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, "v_floor_f64", True, True, InstrClass.ValuDouble),
   (  -1,   -1,   -1,   -1, 0x1b, "v_pipeflush", False, False),
   (0x20, 0x20, 0x1b, 0x1b, 0x20, "v_fract_f32", True, True),
   (0x21, 0x21, 0x1c, 0x1c, 0x21, "v_trunc_f32", True, True),
   (0x22, 0x22, 0x1d, 0x1d, 0x22, "v_ceil_f32", True, True),
   (0x23, 0x23, 0x1e, 0x1e, 0x23, "v_rndne_f32", True, True),
   (0x24, 0x24, 0x1f, 0x1f, 0x24, "v_floor_f32", True, True),
   (0x25, 0x25, 0x20, 0x20, 0x25, "v_exp_f32", True, True, InstrClass.ValuTranscendental32),
   (0x26, 0x26,   -1,   -1,   -1, "v_log_clamp_f32", True, True, InstrClass.ValuTranscendental32),
   (0x27, 0x27, 0x21, 0x21, 0x27, "v_log_f32", True, True, InstrClass.ValuTranscendental32),
   (0x28, 0x28,   -1,   -1,   -1, "v_rcp_clamp_f32", True, True, InstrClass.ValuTranscendental32),
   (0x29, 0x29,   -1,   -1,   -1, "v_rcp_legacy_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2a, 0x2a, 0x22, 0x22, 0x2a, "v_rcp_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2b, 0x2b, 0x23, 0x23, 0x2b, "v_rcp_iflag_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2c, 0x2c,   -1,   -1,   -1, "v_rsq_clamp_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2d, 0x2d,   -1,   -1,   -1, "v_rsq_legacy_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2e, 0x2e, 0x24, 0x24, 0x2e, "v_rsq_f32", True, True, InstrClass.ValuTranscendental32),
   (0x2f, 0x2f, 0x25, 0x25, 0x2f, "v_rcp_f64", True, True, InstrClass.ValuDoubleTranscendental),
   (0x30, 0x30,   -1,   -1,   -1, "v_rcp_clamp_f64", True, True, InstrClass.ValuDoubleTranscendental),
   (0x31, 0x31, 0x26, 0x26, 0x31, "v_rsq_f64", True, True, InstrClass.ValuDoubleTranscendental),
   (0x32, 0x32,   -1,   -1,   -1, "v_rsq_clamp_f64", True, True, InstrClass.ValuDoubleTranscendental),
   (0x33, 0x33, 0x27, 0x27, 0x33, "v_sqrt_f32", True, True, InstrClass.ValuTranscendental32),
   (0x34, 0x34, 0x28, 0x28, 0x34, "v_sqrt_f64", True, True, InstrClass.ValuDoubleTranscendental),
   (0x35, 0x35, 0x29, 0x29, 0x35, "v_sin_f32", True, True, InstrClass.ValuTranscendental32),
   (0x36, 0x36, 0x2a, 0x2a, 0x36, "v_cos_f32", True, True, InstrClass.ValuTranscendental32),
   (0x37, 0x37, 0x2b, 0x2b, 0x37, "v_not_b32", False, False),
   (0x38, 0x38, 0x2c, 0x2c, 0x38, "v_bfrev_b32", False, False),
   (0x39, 0x39, 0x2d, 0x2d, 0x39, "v_ffbh_u32", False, False),
   (0x3a, 0x3a, 0x2e, 0x2e, 0x3a, "v_ffbl_b32", False, False),
   (0x3b, 0x3b, 0x2f, 0x2f, 0x3b, "v_ffbh_i32", False, False),
   (0x3c, 0x3c, 0x30, 0x30, 0x3c, "v_frexp_exp_i32_f64", True, False, InstrClass.ValuDouble),
   (0x3d, 0x3d, 0x31, 0x31, 0x3d, "v_frexp_mant_f64", True, False, InstrClass.ValuDouble),
   (0x3e, 0x3e, 0x32, 0x32, 0x3e, "v_fract_f64", True, True, InstrClass.ValuDouble),
   (0x3f, 0x3f, 0x33, 0x33, 0x3f, "v_frexp_exp_i32_f32", True, False),
   (0x40, 0x40, 0x34, 0x34, 0x40, "v_frexp_mant_f32", True, False),
   (0x41, 0x41, 0x35, 0x35, 0x41, "v_clrexcp", False, False),
   (0x42, 0x42, 0x36,   -1, 0x42, "v_movreld_b32", False, False),
   (0x43, 0x43, 0x37,   -1, 0x43, "v_movrels_b32", False, False),
   (0x44, 0x44, 0x38,   -1, 0x44, "v_movrelsd_b32", False, False),
   (  -1,   -1,   -1,   -1, 0x48, "v_movrelsd_2_b32", False, False),
   (  -1,   -1,   -1, 0x37,   -1, "v_screen_partition_4se_b32", False, False),
   (  -1,   -1, 0x39, 0x39, 0x50, "v_cvt_f16_u16", False, True),
   (  -1,   -1, 0x3a, 0x3a, 0x51, "v_cvt_f16_i16", False, True),
   (  -1,   -1, 0x3b, 0x3b, 0x52, "v_cvt_u16_f16", True, False),
   (  -1,   -1, 0x3c, 0x3c, 0x53, "v_cvt_i16_f16", True, False),
   (  -1,   -1, 0x3d, 0x3d, 0x54, "v_rcp_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x3e, 0x3e, 0x55, "v_sqrt_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x3f, 0x3f, 0x56, "v_rsq_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x40, 0x40, 0x57, "v_log_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x41, 0x41, 0x58, "v_exp_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x42, 0x42, 0x59, "v_frexp_mant_f16", True, False),
   (  -1,   -1, 0x43, 0x43, 0x5a, "v_frexp_exp_i16_f16", True, False),
   (  -1,   -1, 0x44, 0x44, 0x5b, "v_floor_f16", True, True),
   (  -1,   -1, 0x45, 0x45, 0x5c, "v_ceil_f16", True, True),
   (  -1,   -1, 0x46, 0x46, 0x5d, "v_trunc_f16", True, True),
   (  -1,   -1, 0x47, 0x47, 0x5e, "v_rndne_f16", True, True),
   (  -1,   -1, 0x48, 0x48, 0x5f, "v_fract_f16", True, True),
   (  -1,   -1, 0x49, 0x49, 0x60, "v_sin_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x4a, 0x4a, 0x61, "v_cos_f16", True, True, InstrClass.ValuTranscendental32),
   (  -1, 0x46, 0x4b, 0x4b,   -1, "v_exp_legacy_f32", True, True, InstrClass.ValuTranscendental32),
   (  -1, 0x45, 0x4c, 0x4c,   -1, "v_log_legacy_f32", True, True, InstrClass.ValuTranscendental32),
   (  -1,   -1,   -1, 0x4f, 0x62, "v_sat_pk_u8_i16", False, False),
   (  -1,   -1,   -1, 0x4d, 0x63, "v_cvt_norm_i16_f16", True, False),
   (  -1,   -1,   -1, 0x4e, 0x64, "v_cvt_norm_u16_f16", True, False),
   (  -1,   -1,   -1, 0x51, 0x65, "v_swap_b32", False, False),
   (  -1,   -1,   -1,   -1, 0x68, "v_swaprel_b32", False, False),
}
# Register every VOP1 entry. The table is run through default_class() with
# InstrClass.Valu32 so that each row unpacks to nine fields, including a class.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod, cls) in default_class(VOP1, InstrClass.Valu32):
   opcode(name, gfx7, gfx9, gfx10, Format.VOP1, cls, in_mod, out_mod)

865

866
# VOPC instructions:

# Class-test compares. Columns: GFX6, GFX7, GFX8, GFX9, GFX10, name and an
# optional InstrClass as the last field.
VOPC_CLASS = {
   (0x88, 0x88, 0x10, 0x10, 0x88, "v_cmp_class_f32"),
   (  -1,   -1, 0x14, 0x14, 0x8f, "v_cmp_class_f16"),
   (0x98, 0x98, 0x11, 0x11, 0x98, "v_cmpx_class_f32"),
   (  -1,   -1, 0x15, 0x15, 0x9f, "v_cmpx_class_f16"),
   (0xa8, 0xa8, 0x12, 0x12, 0xa8, "v_cmp_class_f64", InstrClass.ValuDouble),
   (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64", InstrClass.ValuDouble),
}
# Register the class-test compares with input modifiers enabled and output
# modifiers disabled; rows without an explicit class go through
# default_class() with InstrClass.Valu32.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, cls) in default_class(VOPC_CLASS, InstrClass.Valu32):
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, cls, True, False)

879
# Float comparison condition suffixes. The position of a suffix in this list
# is the offset added to the opcode bases in the loops below.
COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]

# 16-bit float compares. Conditions 0..7 and 8..15 are registered from
# separate opcode bases, hence the COMPF[i] / COMPF[i+8] pairs.
for i in range(8):
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)

891
# 32-bit and 64-bit float compares: one v_cmp and one v_cmpx opcode per COMPF
# condition and operand size. The f64 variants use InstrClass.ValuDouble.
for i in range(16):
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.ValuDouble, True, False)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.ValuDouble, True, False)
   # GFX_6_7
   # NOTE: the four v_cmps*/v_cmpsx* tuples below are only assigned; no
   # opcode() call follows them, so these instructions are not registered.
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32")
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32")
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x60+i, 0x60+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f64")
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x70+i, 0x70+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f64")

906
# Integer comparison condition suffixes. The position of a suffix in this
# list is the offset added to the opcode bases in the loops below.
COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"]

# GFX_8_9
# The "f" and "tru" 16-bit integer compares are registered with -1 in the
# GFX10 slot; the remaining conditions are handled by the next loop.
for i in [0,7]: # only 0 and 7
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)

919
# 16-bit integer compares for conditions "lt" through "ge". Unlike the "f" and
# "tru" variants, these also carry a GFX10 opcode number.
for i in range(1, 7): # [1..6]
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)

929
# 32-bit and 64-bit integer compares, signed and unsigned: one v_cmp and one
# v_cmpx opcode per COMPI condition. The 64-bit variants use InstrClass.Valu64.
for i in range(8):
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu64)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu64)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu32)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu64)
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64")
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, InstrClass.Valu64)

947

948
# VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output
#
# Unlike the per-generation tables, each entry has a single opcode number.
VOPP = {
   # opcode, name, input/output modifiers
   (0x00, "v_pk_mad_i16", False),
   (0x01, "v_pk_mul_lo_u16", False),
   (0x02, "v_pk_add_i16", False),
   (0x03, "v_pk_sub_i16", False),
   (0x04, "v_pk_lshlrev_b16", False),
   (0x05, "v_pk_lshrrev_b16", False),
   (0x06, "v_pk_ashrrev_i16", False),
   (0x07, "v_pk_max_i16", False),
   (0x08, "v_pk_min_i16", False),
   (0x09, "v_pk_mad_u16", False),
   (0x0a, "v_pk_add_u16", False),
   (0x0b, "v_pk_sub_u16", False),
   (0x0c, "v_pk_max_u16", False),
   (0x0d, "v_pk_min_u16", False),
   (0x0e, "v_pk_fma_f16", True),
   (0x0f, "v_pk_add_f16", True),
   (0x10, "v_pk_mul_f16", True),
   (0x11, "v_pk_min_f16", True),
   (0x12, "v_pk_max_f16", True),
   (0x20, "v_fma_mix_f32", True), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA
   (0x21, "v_fma_mixlo_f16", True), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA
   (0x22, "v_fma_mixhi_f16", True), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA
}
# note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name)
# The same code is passed for the GFX9 and GFX10 slots, -1 for the GFX7 slot,
# and the single modifiers flag is forwarded for both trailing arguments.
for (code, name, modifiers) in VOPP:
   opcode(name, -1, code, code, Format.VOP3P, InstrClass.Valu32, modifiers, modifiers)

979

980
# VINTERP instructions:
# Each entry is (opcode, name); there is one opcode number per instruction.
VINTRP = {
   (0x00, "v_interp_p1_f32"),
   (0x01, "v_interp_p2_f32"),
   (0x02, "v_interp_mov_f32"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
# The same code is passed for every generation slot of opcode().
for (code, name) in VINTRP:
   opcode(name, code, code, code, Format.VINTRP, InstrClass.Valu32)

990
# VOP3 instructions: 3 inputs, 1 output
# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
#
# Entries may end with an explicit InstrClass; entries without one get their
# class when the table is registered.
# NOTE(review): -1 presumably marks a generation without an encoding for the
# instruction -- confirm against opcode().
VOP3 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers, output_modifiers
   (0x140, 0x140, 0x1c0, 0x1c0, 0x140, "v_mad_legacy_f32", True, True), # GFX6-GFX10
   (0x141, 0x141, 0x1c1, 0x1c1, 0x141, "v_mad_f32", True, True),
   (0x142, 0x142, 0x1c2, 0x1c2, 0x142, "v_mad_i32_i24", False, False),
   (0x143, 0x143, 0x1c3, 0x1c3, 0x143, "v_mad_u32_u24", False, False),
   (0x144, 0x144, 0x1c4, 0x1c4, 0x144, "v_cubeid_f32", True, True),
   (0x145, 0x145, 0x1c5, 0x1c5, 0x145, "v_cubesc_f32", True, True),
   (0x146, 0x146, 0x1c6, 0x1c6, 0x146, "v_cubetc_f32", True, True),
   (0x147, 0x147, 0x1c7, 0x1c7, 0x147, "v_cubema_f32", True, True),
   (0x148, 0x148, 0x1c8, 0x1c8, 0x148, "v_bfe_u32", False, False),
   (0x149, 0x149, 0x1c9, 0x1c9, 0x149, "v_bfe_i32", False, False),
   (0x14a, 0x14a, 0x1ca, 0x1ca, 0x14a, "v_bfi_b32", False, False),
   (0x14b, 0x14b, 0x1cb, 0x1cb, 0x14b, "v_fma_f32", True, True, InstrClass.ValuFma),
   (0x14c, 0x14c, 0x1cc, 0x1cc, 0x14c, "v_fma_f64", True, True, InstrClass.ValuDouble),
   (0x14d, 0x14d, 0x1cd, 0x1cd, 0x14d, "v_lerp_u8", False, False),
   (0x14e, 0x14e, 0x1ce, 0x1ce, 0x14e, "v_alignbit_b32", False, False),
   (0x14f, 0x14f, 0x1cf, 0x1cf, 0x14f, "v_alignbyte_b32", False, False),
   (0x150, 0x150,    -1,    -1, 0x150, "v_mullit_f32", True, True),
   (0x151, 0x151, 0x1d0, 0x1d0, 0x151, "v_min3_f32", True, True),
   (0x152, 0x152, 0x1d1, 0x1d1, 0x152, "v_min3_i32", False, False),
   (0x153, 0x153, 0x1d2, 0x1d2, 0x153, "v_min3_u32", False, False),
   (0x154, 0x154, 0x1d3, 0x1d3, 0x154, "v_max3_f32", True, True),
   (0x155, 0x155, 0x1d4, 0x1d4, 0x155, "v_max3_i32", False, False),
   (0x156, 0x156, 0x1d5, 0x1d5, 0x156, "v_max3_u32", False, False),
   (0x157, 0x157, 0x1d6, 0x1d6, 0x157, "v_med3_f32", True, True),
   (0x158, 0x158, 0x1d7, 0x1d7, 0x158, "v_med3_i32", False, False),
   (0x159, 0x159, 0x1d8, 0x1d8, 0x159, "v_med3_u32", False, False),
   (0x15a, 0x15a, 0x1d9, 0x1d9, 0x15a, "v_sad_u8", False, False),
   (0x15b, 0x15b, 0x1da, 0x1da, 0x15b, "v_sad_hi_u8", False, False),
   (0x15c, 0x15c, 0x1db, 0x1db, 0x15c, "v_sad_u16", False, False),
   (0x15d, 0x15d, 0x1dc, 0x1dc, 0x15d, "v_sad_u32", False, False),
   (0x15e, 0x15e, 0x1dd, 0x1dd, 0x15e, "v_cvt_pk_u8_f32", True, False),
   (0x15f, 0x15f, 0x1de, 0x1de, 0x15f, "v_div_fixup_f32", True, True),
   (0x160, 0x160, 0x1df, 0x1df, 0x160, "v_div_fixup_f64", True, True),
   (0x161, 0x161,    -1,    -1,    -1, "v_lshl_b64", False, False, InstrClass.Valu64),
   (0x162, 0x162,    -1,    -1,    -1, "v_lshr_b64", False, False, InstrClass.Valu64),
   (0x163, 0x163,    -1,    -1,    -1, "v_ashr_i64", False, False, InstrClass.Valu64),
   (0x164, 0x164, 0x280, 0x280, 0x164, "v_add_f64", True, True, InstrClass.ValuDoubleAdd),
   (0x165, 0x165, 0x281, 0x281, 0x165, "v_mul_f64", True, True, InstrClass.ValuDouble),
   (0x166, 0x166, 0x282, 0x282, 0x166, "v_min_f64", True, True, InstrClass.ValuDouble),
   (0x167, 0x167, 0x283, 0x283, 0x167, "v_max_f64", True, True, InstrClass.ValuDouble),
   (0x168, 0x168, 0x284, 0x284, 0x168, "v_ldexp_f64", False, True, InstrClass.ValuDouble), # src1 can take input modifiers
   (0x169, 0x169, 0x285, 0x285, 0x169, "v_mul_lo_u32", False, False, InstrClass.ValuQuarterRate32),
   (0x16a, 0x16a, 0x286, 0x286, 0x16a, "v_mul_hi_u32", False, False, InstrClass.ValuQuarterRate32),
   (0x16b, 0x16b, 0x285, 0x285, 0x16b, "v_mul_lo_i32", False, False, InstrClass.ValuQuarterRate32), # identical to v_mul_lo_u32
   (0x16c, 0x16c, 0x287, 0x287, 0x16c, "v_mul_hi_i32", False, False, InstrClass.ValuQuarterRate32),
   (0x16d, 0x16d, 0x1e0, 0x1e0, 0x16d, "v_div_scale_f32", True, True), # writes to VCC
   (0x16e, 0x16e, 0x1e1, 0x1e1, 0x16e, "v_div_scale_f64", True, True, InstrClass.ValuDouble), # writes to VCC
   (0x16f, 0x16f, 0x1e2, 0x1e2, 0x16f, "v_div_fmas_f32", True, True), # takes VCC input
   (0x170, 0x170, 0x1e3, 0x1e3, 0x170, "v_div_fmas_f64", True, True, InstrClass.ValuDouble), # takes VCC input
   (0x171, 0x171, 0x1e4, 0x1e4, 0x171, "v_msad_u8", False, False),
   (0x172, 0x172, 0x1e5, 0x1e5, 0x172, "v_qsad_pk_u16_u8", False, False),
   (0x172,    -1,    -1,    -1,    -1, "v_qsad_u8", False, False), # what's the difference?
   (0x173, 0x173, 0x1e6, 0x1e6, 0x173, "v_mqsad_pk_u16_u8", False, False),
   (0x173,    -1,    -1,    -1,    -1, "v_mqsad_u8", False, False), # what's the difference?
   (0x174, 0x174, 0x292, 0x292, 0x174, "v_trig_preop_f64", False, False, InstrClass.ValuDouble),
   (   -1, 0x175, 0x1e7, 0x1e7, 0x175, "v_mqsad_u32_u8", False, False),
   (   -1, 0x176, 0x1e8, 0x1e8, 0x176, "v_mad_u64_u32", False, False, InstrClass.Valu64),
   (   -1, 0x177, 0x1e9, 0x1e9, 0x177, "v_mad_i64_i32", False, False, InstrClass.Valu64),
   (   -1,    -1, 0x1ea, 0x1ea,    -1, "v_mad_legacy_f16", True, True),
   (   -1,    -1, 0x1eb, 0x1eb,    -1, "v_mad_legacy_u16", False, False),
   (   -1,    -1, 0x1ec, 0x1ec,    -1, "v_mad_legacy_i16", False, False),
   (   -1,    -1, 0x1ed, 0x1ed, 0x344, "v_perm_b32", False, False),
   (   -1,    -1, 0x1ee, 0x1ee,    -1, "v_fma_legacy_f16", True, True, InstrClass.ValuFma),
   (   -1,    -1, 0x1ef, 0x1ef,    -1, "v_div_fixup_legacy_f16", True, True),
   (0x12c, 0x12c, 0x1f0, 0x1f0,    -1, "v_cvt_pkaccum_u8_f32", True, False),
   (   -1,    -1,    -1, 0x1f1, 0x373, "v_mad_u32_u16", False, False),
   (   -1,    -1,    -1, 0x1f2, 0x375, "v_mad_i32_i16", False, False),
   (   -1,    -1,    -1, 0x1f3, 0x345, "v_xad_u32", False, False),
   (   -1,    -1,    -1, 0x1f4, 0x351, "v_min3_f16", True, True),
   (   -1,    -1,    -1, 0x1f5, 0x352, "v_min3_i16", False, False),
   (   -1,    -1,    -1, 0x1f6, 0x353, "v_min3_u16", False, False),
   (   -1,    -1,    -1, 0x1f7, 0x354, "v_max3_f16", True, True),
   (   -1,    -1,    -1, 0x1f8, 0x355, "v_max3_i16", False, False),
   (   -1,    -1,    -1, 0x1f9, 0x356, "v_max3_u16", False, False),
   (   -1,    -1,    -1, 0x1fa, 0x357, "v_med3_f16", True, True),
   (   -1,    -1,    -1, 0x1fb, 0x358, "v_med3_i16", False, False),
   (   -1,    -1,    -1, 0x1fc, 0x359, "v_med3_u16", False, False),
   (   -1,    -1,    -1, 0x1fd, 0x346, "v_lshl_add_u32", False, False),
   (   -1,    -1,    -1, 0x1fe, 0x347, "v_add_lshl_u32", False, False),
   (   -1,    -1,    -1, 0x1ff, 0x36d, "v_add3_u32", False, False),
   (   -1,    -1,    -1, 0x200, 0x36f, "v_lshl_or_b32", False, False),
   (   -1,    -1,    -1, 0x201, 0x371, "v_and_or_b32", False, False),
   (   -1,    -1,    -1, 0x202, 0x372, "v_or3_b32", False, False),
   (   -1,    -1,    -1, 0x203,    -1, "v_mad_f16", True, True),
   (   -1,    -1,    -1, 0x204, 0x340, "v_mad_u16", False, False),
   (   -1,    -1,    -1, 0x205, 0x35e, "v_mad_i16", False, False),
   (   -1,    -1,    -1, 0x206, 0x34b, "v_fma_f16", True, True),
   (   -1,    -1,    -1, 0x207, 0x35f, "v_div_fixup_f16", True, True),
   (   -1,    -1, 0x274, 0x274, 0x342, "v_interp_p1ll_f16", True, True),
   (   -1,    -1, 0x275, 0x275, 0x343, "v_interp_p1lv_f16", True, True),
   (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
   (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
   (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
   (   -1,    -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
   (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
   (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
   (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
   (   -1,    -1, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32_e64", False, False),
   (   -1,    -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False, InstrClass.Valu64),
   (   -1,    -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False, InstrClass.Valu64),
   (   -1,    -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False, InstrClass.Valu64),
   (0x11e, 0x11e, 0x293, 0x293, 0x363, "v_bfm_b32", False, False),
   (0x12d, 0x12d, 0x294, 0x294, 0x368, "v_cvt_pknorm_i16_f32", True, False),
   (0x12e, 0x12e, 0x295, 0x295, 0x369, "v_cvt_pknorm_u16_f32", True, False),
   (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32_e64", True, False), # GFX6_7_10 is VOP2 with opcode 0x02f
   (0x130, 0x130, 0x297, 0x297, 0x36a, "v_cvt_pk_u16_u32", False, False),
   (0x131, 0x131, 0x298, 0x298, 0x36b, "v_cvt_pk_i16_i32", False, False),
   (   -1,    -1,    -1, 0x299, 0x312, "v_cvt_pknorm_i16_f16", True, False),
   (   -1,    -1,    -1, 0x29a, 0x313, "v_cvt_pknorm_u16_f16", True, False),
   (   -1,    -1,    -1, 0x29c, 0x37f, "v_add_i32", False, False),
   (   -1,    -1,    -1, 0x29d, 0x376, "v_sub_i32", False, False),
   (   -1,    -1,    -1, 0x29e, 0x30d, "v_add_i16", False, False),
   (   -1,    -1,    -1, 0x29f, 0x30e, "v_sub_i16", False, False),
   (   -1,    -1,    -1, 0x2a0, 0x311, "v_pack_b32_f16", True, False),
   (   -1,    -1,    -1,    -1, 0x178, "v_xor3_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x377, "v_permlane16_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x378, "v_permlanex16_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x30f, "v_add_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x310, "v_sub_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x319, "v_subrev_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x303, "v_add_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x304, "v_sub_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x305, "v_mul_lo_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x309, "v_max_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30a, "v_max_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30b, "v_min_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30c, "v_min_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x307, "v_lshrrev_b16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x308, "v_ashrrev_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x314, "v_lshlrev_b16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x140, "v_fma_legacy_f32", True, True, InstrClass.ValuFma), #GFX10.3+
}
# Register every VOP3 table entry.
# default_class() is defined elsewhere in this file; judging by its use here it
# yields each tuple with an instruction class filled in (InstrClass.Valu32 when
# the entry does not name one) -- TODO confirm against its definition.
# Only the gfx7, gfx9 and gfx10 columns are forwarded to opcode(); gfx6 and
# gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod, cls) in default_class(VOP3, InstrClass.Valu32):
   opcode(name, gfx7, gfx9, gfx10, Format.VOP3, cls, in_mod, out_mod)


# DS instructions: 3 inputs (1 addr, 2 data), 1 output
#
# Each entry is (gfx6, gfx7, gfx8, gfx9, gfx10, name): the opcode number of the
# instruction on each hardware generation. -1 is used where no opcode number is
# given for a generation (the duplicate-opcode check in this file skips -1).
DS = {
   (0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "ds_sub_u32"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "ds_rsub_u32"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "ds_inc_u32"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "ds_dec_u32"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "ds_min_i32"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "ds_max_i32"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "ds_min_u32"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "ds_max_u32"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "ds_and_b32"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "ds_or_b32"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "ds_xor_b32"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "ds_mskor_b32"),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "ds_write_b32"),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "ds_write2_b32"),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "ds_write2st64_b32"),
   (0x10, 0x10, 0x10, 0x10, 0x10, "ds_cmpst_b32"),
   (0x11, 0x11, 0x11, 0x11, 0x11, "ds_cmpst_f32"),
   (0x12, 0x12, 0x12, 0x12, 0x12, "ds_min_f32"),
   (0x13, 0x13, 0x13, 0x13, 0x13, "ds_max_f32"),
   (  -1, 0x14, 0x14, 0x14, 0x14, "ds_nop"),
   (  -1,   -1, 0x15, 0x15, 0x15, "ds_add_f32"),
   (  -1,   -1, 0x1d, 0x1d, 0xb0, "ds_write_addtid_b32"),
   (0x1e, 0x1e, 0x1e, 0x1e, 0x1e, "ds_write_b8"),
   (0x1f, 0x1f, 0x1f, 0x1f, 0x1f, "ds_write_b16"),
   (0x20, 0x20, 0x20, 0x20, 0x20, "ds_add_rtn_u32"),
   (0x21, 0x21, 0x21, 0x21, 0x21, "ds_sub_rtn_u32"),
   (0x22, 0x22, 0x22, 0x22, 0x22, "ds_rsub_rtn_u32"),
   (0x23, 0x23, 0x23, 0x23, 0x23, "ds_inc_rtn_u32"),
   (0x24, 0x24, 0x24, 0x24, 0x24, "ds_dec_rtn_u32"),
   (0x25, 0x25, 0x25, 0x25, 0x25, "ds_min_rtn_i32"),
   (0x26, 0x26, 0x26, 0x26, 0x26, "ds_max_rtn_i32"),
   (0x27, 0x27, 0x27, 0x27, 0x27, "ds_min_rtn_u32"),
   (0x28, 0x28, 0x28, 0x28, 0x28, "ds_max_rtn_u32"),
   (0x29, 0x29, 0x29, 0x29, 0x29, "ds_and_rtn_b32"),
   (0x2a, 0x2a, 0x2a, 0x2a, 0x2a, "ds_or_rtn_b32"),
   (0x2b, 0x2b, 0x2b, 0x2b, 0x2b, "ds_xor_rtn_b32"),
   (0x2c, 0x2c, 0x2c, 0x2c, 0x2c, "ds_mskor_rtn_b32"),
   (0x2d, 0x2d, 0x2d, 0x2d, 0x2d, "ds_wrxchg_rtn_b32"),
   (0x2e, 0x2e, 0x2e, 0x2e, 0x2e, "ds_wrxchg2_rtn_b32"),
   (0x2f, 0x2f, 0x2f, 0x2f, 0x2f, "ds_wrxchg2st64_rtn_b32"),
   (0x30, 0x30, 0x30, 0x30, 0x30, "ds_cmpst_rtn_b32"),
   (0x31, 0x31, 0x31, 0x31, 0x31, "ds_cmpst_rtn_f32"),
   (0x32, 0x32, 0x32, 0x32, 0x32, "ds_min_rtn_f32"),
   (0x33, 0x33, 0x33, 0x33, 0x33, "ds_max_rtn_f32"),
   (  -1, 0x34, 0x34, 0x34, 0x34, "ds_wrap_rtn_b32"),
   (  -1,   -1, 0x35, 0x35, 0x55, "ds_add_rtn_f32"),
   (0x36, 0x36, 0x36, 0x36, 0x36, "ds_read_b32"),
   (0x37, 0x37, 0x37, 0x37, 0x37, "ds_read2_b32"),
   (0x38, 0x38, 0x38, 0x38, 0x38, "ds_read2st64_b32"),
   (0x39, 0x39, 0x39, 0x39, 0x39, "ds_read_i8"),
   (0x3a, 0x3a, 0x3a, 0x3a, 0x3a, "ds_read_u8"),
   (0x3b, 0x3b, 0x3b, 0x3b, 0x3b, "ds_read_i16"),
   (0x3c, 0x3c, 0x3c, 0x3c, 0x3c, "ds_read_u16"),
   (0x35, 0x35, 0x3d, 0x3d, 0x35, "ds_swizzle_b32"), #data1 & offset, no addr/data2
   (  -1,   -1, 0x3e, 0x3e, 0xb2, "ds_permute_b32"),
   (  -1,   -1, 0x3f, 0x3f, 0xb3, "ds_bpermute_b32"),
   (0x40, 0x40, 0x40, 0x40, 0x40, "ds_add_u64"),
   (0x41, 0x41, 0x41, 0x41, 0x41, "ds_sub_u64"),
   (0x42, 0x42, 0x42, 0x42, 0x42, "ds_rsub_u64"),
   (0x43, 0x43, 0x43, 0x43, 0x43, "ds_inc_u64"),
   (0x44, 0x44, 0x44, 0x44, 0x44, "ds_dec_u64"),
   (0x45, 0x45, 0x45, 0x45, 0x45, "ds_min_i64"),
   (0x46, 0x46, 0x46, 0x46, 0x46, "ds_max_i64"),
   (0x47, 0x47, 0x47, 0x47, 0x47, "ds_min_u64"),
   (0x48, 0x48, 0x48, 0x48, 0x48, "ds_max_u64"),
   (0x49, 0x49, 0x49, 0x49, 0x49, "ds_and_b64"),
   (0x4a, 0x4a, 0x4a, 0x4a, 0x4a, "ds_or_b64"),
   (0x4b, 0x4b, 0x4b, 0x4b, 0x4b, "ds_xor_b64"),
   (0x4c, 0x4c, 0x4c, 0x4c, 0x4c, "ds_mskor_b64"),
   (0x4d, 0x4d, 0x4d, 0x4d, 0x4d, "ds_write_b64"),
   (0x4e, 0x4e, 0x4e, 0x4e, 0x4e, "ds_write2_b64"),
   (0x4f, 0x4f, 0x4f, 0x4f, 0x4f, "ds_write2st64_b64"),
   (0x50, 0x50, 0x50, 0x50, 0x50, "ds_cmpst_b64"),
   (0x51, 0x51, 0x51, 0x51, 0x51, "ds_cmpst_f64"),
   (0x52, 0x52, 0x52, 0x52, 0x52, "ds_min_f64"),
   (0x53, 0x53, 0x53, 0x53, 0x53, "ds_max_f64"),
   (  -1,   -1,   -1, 0x54, 0xa0, "ds_write_b8_d16_hi"),
   (  -1,   -1,   -1, 0x55, 0xa1, "ds_write_b16_d16_hi"),
   (  -1,   -1,   -1, 0x56, 0xa2, "ds_read_u8_d16"),
   (  -1,   -1,   -1, 0x57, 0xa3, "ds_read_u8_d16_hi"),
   (  -1,   -1,   -1, 0x58, 0xa4, "ds_read_i8_d16"),
   (  -1,   -1,   -1, 0x59, 0xa5, "ds_read_i8_d16_hi"),
   (  -1,   -1,   -1, 0x5a, 0xa6, "ds_read_u16_d16"),
   (  -1,   -1,   -1, 0x5b, 0xa7, "ds_read_u16_d16_hi"),
   (0x60, 0x60, 0x60, 0x60, 0x60, "ds_add_rtn_u64"),
   (0x61, 0x61, 0x61, 0x61, 0x61, "ds_sub_rtn_u64"),
   (0x62, 0x62, 0x62, 0x62, 0x62, "ds_rsub_rtn_u64"),
   (0x63, 0x63, 0x63, 0x63, 0x63, "ds_inc_rtn_u64"),
   (0x64, 0x64, 0x64, 0x64, 0x64, "ds_dec_rtn_u64"),
   (0x65, 0x65, 0x65, 0x65, 0x65, "ds_min_rtn_i64"),
   (0x66, 0x66, 0x66, 0x66, 0x66, "ds_max_rtn_i64"),
   (0x67, 0x67, 0x67, 0x67, 0x67, "ds_min_rtn_u64"),
   (0x68, 0x68, 0x68, 0x68, 0x68, "ds_max_rtn_u64"),
   (0x69, 0x69, 0x69, 0x69, 0x69, "ds_and_rtn_b64"),
   (0x6a, 0x6a, 0x6a, 0x6a, 0x6a, "ds_or_rtn_b64"),
   (0x6b, 0x6b, 0x6b, 0x6b, 0x6b, "ds_xor_rtn_b64"),
   (0x6c, 0x6c, 0x6c, 0x6c, 0x6c, "ds_mskor_rtn_b64"),
   (0x6d, 0x6d, 0x6d, 0x6d, 0x6d, "ds_wrxchg_rtn_b64"),
   (0x6e, 0x6e, 0x6e, 0x6e, 0x6e, "ds_wrxchg2_rtn_b64"),
   (0x6f, 0x6f, 0x6f, 0x6f, 0x6f, "ds_wrxchg2st64_rtn_b64"),
   (0x70, 0x70, 0x70, 0x70, 0x70, "ds_cmpst_rtn_b64"),
   (0x71, 0x71, 0x71, 0x71, 0x71, "ds_cmpst_rtn_f64"),
   (0x72, 0x72, 0x72, 0x72, 0x72, "ds_min_rtn_f64"),
   (0x73, 0x73, 0x73, 0x73, 0x73, "ds_max_rtn_f64"),
   (0x76, 0x76, 0x76, 0x76, 0x76, "ds_read_b64"),
   (0x77, 0x77, 0x77, 0x77, 0x77, "ds_read2_b64"),
   (0x78, 0x78, 0x78, 0x78, 0x78, "ds_read2st64_b64"),
   (  -1, 0x7e, 0x7e, 0x7e, 0x7e, "ds_condxchg32_rtn_b64"),
   (0x80, 0x80, 0x80, 0x80, 0x80, "ds_add_src2_u32"),
   (0x81, 0x81, 0x81, 0x81, 0x81, "ds_sub_src2_u32"),
   (0x82, 0x82, 0x82, 0x82, 0x82, "ds_rsub_src2_u32"),
   (0x83, 0x83, 0x83, 0x83, 0x83, "ds_inc_src2_u32"),
   (0x84, 0x84, 0x84, 0x84, 0x84, "ds_dec_src2_u32"),
   (0x85, 0x85, 0x85, 0x85, 0x85, "ds_min_src2_i32"),
   (0x86, 0x86, 0x86, 0x86, 0x86, "ds_max_src2_i32"),
   (0x87, 0x87, 0x87, 0x87, 0x87, "ds_min_src2_u32"),
   (0x88, 0x88, 0x88, 0x88, 0x88, "ds_max_src2_u32"),
   (0x89, 0x89, 0x89, 0x89, 0x89, "ds_and_src2_b32"),
   (0x8a, 0x8a, 0x8a, 0x8a, 0x8a, "ds_or_src2_b32"),
   (0x8b, 0x8b, 0x8b, 0x8b, 0x8b, "ds_xor_src2_b32"),
   (0x8d, 0x8d, 0x8d, 0x8d, 0x8d, "ds_write_src2_b32"),
   (0x92, 0x92, 0x92, 0x92, 0x92, "ds_min_src2_f32"),
   (0x93, 0x93, 0x93, 0x93, 0x93, "ds_max_src2_f32"),
   (  -1,   -1, 0x95, 0x95, 0x95, "ds_add_src2_f32"),
   (  -1, 0x18, 0x98, 0x98, 0x18, "ds_gws_sema_release_all"),
   (0x19, 0x19, 0x99, 0x99, 0x19, "ds_gws_init"),
   (0x1a, 0x1a, 0x9a, 0x9a, 0x1a, "ds_gws_sema_v"),
   (0x1b, 0x1b, 0x9b, 0x9b, 0x1b, "ds_gws_sema_br"),
   (0x1c, 0x1c, 0x9c, 0x9c, 0x1c, "ds_gws_sema_p"),
   (0x1d, 0x1d, 0x9d, 0x9d, 0x1d, "ds_gws_barrier"),
   (  -1,   -1, 0xb6, 0xb6, 0xb1, "ds_read_addtid_b32"),
   (0x3d, 0x3d, 0xbd, 0xbd, 0x3d, "ds_consume"),
   (0x3e, 0x3e, 0xbe, 0xbe, 0x3e, "ds_append"),
   (0x3f, 0x3f, 0xbf, 0xbf, 0x3f, "ds_ordered_count"),
   (0xc0, 0xc0, 0xc0, 0xc0, 0xc0, "ds_add_src2_u64"),
   (0xc1, 0xc1, 0xc1, 0xc1, 0xc1, "ds_sub_src2_u64"),
   (0xc2, 0xc2, 0xc2, 0xc2, 0xc2, "ds_rsub_src2_u64"),
   (0xc3, 0xc3, 0xc3, 0xc3, 0xc3, "ds_inc_src2_u64"),
   (0xc4, 0xc4, 0xc4, 0xc4, 0xc4, "ds_dec_src2_u64"),
   (0xc5, 0xc5, 0xc5, 0xc5, 0xc5, "ds_min_src2_i64"),
   (0xc6, 0xc6, 0xc6, 0xc6, 0xc6, "ds_max_src2_i64"),
   (0xc7, 0xc7, 0xc7, 0xc7, 0xc7, "ds_min_src2_u64"),
   (0xc8, 0xc8, 0xc8, 0xc8, 0xc8, "ds_max_src2_u64"),
   (0xc9, 0xc9, 0xc9, 0xc9, 0xc9, "ds_and_src2_b64"),
   (0xca, 0xca, 0xca, 0xca, 0xca, "ds_or_src2_b64"),
   (0xcb, 0xcb, 0xcb, 0xcb, 0xcb, "ds_xor_src2_b64"),
   (0xcd, 0xcd, 0xcd, 0xcd, 0xcd, "ds_write_src2_b64"),
   (0xd2, 0xd2, 0xd2, 0xd2, 0xd2, "ds_min_src2_f64"),
   (0xd3, 0xd3, 0xd3, 0xd3, 0xd3, "ds_max_src2_f64"),
   (  -1, 0xde, 0xde, 0xde, 0xde, "ds_write_b96"),
   (  -1, 0xdf, 0xdf, 0xdf, 0xdf, "ds_write_b128"),
   (  -1, 0xfd, 0xfd,   -1,   -1, "ds_condxchg32_rtn_b128"),
   (  -1, 0xfe, 0xfe, 0xfe, 0xfe, "ds_read_b96"),
   (  -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"),
}
# Register each DS instruction. Only the gfx7, gfx9 and gfx10 columns are
# forwarded to opcode(); gfx6 and gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS:
    opcode(name, gfx7, gfx9, gfx10, Format.DS, InstrClass.DS)

# MUBUF instructions:
#
# Each entry is (gfx6, gfx7, gfx8, gfx9, gfx10, name): the opcode number of the
# instruction on each hardware generation. -1 is used where no opcode number is
# given for a generation (the duplicate-opcode check in this file skips -1).
MUBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, "buffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "buffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "buffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "buffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "buffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "buffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "buffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "buffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x80, "buffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x81, "buffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x82, "buffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x83, "buffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x84, "buffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x85, "buffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x86, "buffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x87, "buffer_store_format_d16_xyzw"),
   (0x08, 0x08, 0x10, 0x10, 0x08, "buffer_load_ubyte"),
   (0x09, 0x09, 0x11, 0x11, 0x09, "buffer_load_sbyte"),
   (0x0a, 0x0a, 0x12, 0x12, 0x0a, "buffer_load_ushort"),
   (0x0b, 0x0b, 0x13, 0x13, 0x0b, "buffer_load_sshort"),
   (0x0c, 0x0c, 0x14, 0x14, 0x0c, "buffer_load_dword"),
   (0x0d, 0x0d, 0x15, 0x15, 0x0d, "buffer_load_dwordx2"),
   (  -1, 0x0f, 0x16, 0x16, 0x0f, "buffer_load_dwordx3"),
   (0x0f, 0x0e, 0x17, 0x17, 0x0e, "buffer_load_dwordx4"),
   (0x18, 0x18, 0x18, 0x18, 0x18, "buffer_store_byte"),
   (  -1,   -1,   -1, 0x19, 0x19, "buffer_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, 0x1a, 0x1a, "buffer_store_short"),
   (  -1,   -1,   -1, 0x1b, 0x1b, "buffer_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, 0x1c, 0x1c, "buffer_store_dword"),
   (0x1d, 0x1d, 0x1d, 0x1d, 0x1d, "buffer_store_dwordx2"),
   (  -1, 0x1f, 0x1e, 0x1e, 0x1f, "buffer_store_dwordx3"),
   (0x1e, 0x1e, 0x1f, 0x1f, 0x1e, "buffer_store_dwordx4"),
   (  -1,   -1,   -1, 0x20, 0x20, "buffer_load_ubyte_d16"),
   (  -1,   -1,   -1, 0x21, 0x21, "buffer_load_ubyte_d16_hi"),
   (  -1,   -1,   -1, 0x22, 0x22, "buffer_load_sbyte_d16"),
   (  -1,   -1,   -1, 0x23, 0x23, "buffer_load_sbyte_d16_hi"),
   (  -1,   -1,   -1, 0x24, 0x24, "buffer_load_short_d16"),
   (  -1,   -1,   -1, 0x25, 0x25, "buffer_load_short_d16_hi"),
   (  -1,   -1,   -1, 0x26, 0x26, "buffer_load_format_d16_hi_x"),
   (  -1,   -1,   -1, 0x27, 0x27, "buffer_store_format_d16_hi_x"),
   (  -1,   -1, 0x3d, 0x3d,   -1, "buffer_store_lds_dword"),
   (0x71, 0x71, 0x3e, 0x3e,   -1, "buffer_wbinvl1"),
   (0x70, 0x70, 0x3f, 0x3f,   -1, "buffer_wbinvl1_vol"),
   (0x30, 0x30, 0x40, 0x40, 0x30, "buffer_atomic_swap"),
   (0x31, 0x31, 0x41, 0x41, 0x31, "buffer_atomic_cmpswap"),
   (0x32, 0x32, 0x42, 0x42, 0x32, "buffer_atomic_add"),
   (0x33, 0x33, 0x43, 0x43, 0x33, "buffer_atomic_sub"),
   (0x34,   -1,   -1,   -1,   -1, "buffer_atomic_rsub"),
   (0x35, 0x35, 0x44, 0x44, 0x35, "buffer_atomic_smin"),
   (0x36, 0x36, 0x45, 0x45, 0x36, "buffer_atomic_umin"),
   (0x37, 0x37, 0x46, 0x46, 0x37, "buffer_atomic_smax"),
   (0x38, 0x38, 0x47, 0x47, 0x38, "buffer_atomic_umax"),
   (0x39, 0x39, 0x48, 0x48, 0x39, "buffer_atomic_and"),
   (0x3a, 0x3a, 0x49, 0x49, 0x3a, "buffer_atomic_or"),
   (0x3b, 0x3b, 0x4a, 0x4a, 0x3b, "buffer_atomic_xor"),
   (0x3c, 0x3c, 0x4b, 0x4b, 0x3c, "buffer_atomic_inc"),
   (0x3d, 0x3d, 0x4c, 0x4c, 0x3d, "buffer_atomic_dec"),
   (0x3e, 0x3e,   -1,   -1, 0x3e, "buffer_atomic_fcmpswap"),
   (0x3f, 0x3f,   -1,   -1, 0x3f, "buffer_atomic_fmin"),
   (0x40, 0x40,   -1,   -1, 0x40, "buffer_atomic_fmax"),
   (0x50, 0x50, 0x60, 0x60, 0x50, "buffer_atomic_swap_x2"),
   (0x51, 0x51, 0x61, 0x61, 0x51, "buffer_atomic_cmpswap_x2"),
   (0x52, 0x52, 0x62, 0x62, 0x52, "buffer_atomic_add_x2"),
   (0x53, 0x53, 0x63, 0x63, 0x53, "buffer_atomic_sub_x2"),
   (0x54,   -1,   -1,   -1,   -1, "buffer_atomic_rsub_x2"),
   (0x55, 0x55, 0x64, 0x64, 0x55, "buffer_atomic_smin_x2"),
   (0x56, 0x56, 0x65, 0x65, 0x56, "buffer_atomic_umin_x2"),
   (0x57, 0x57, 0x66, 0x66, 0x57, "buffer_atomic_smax_x2"),
   (0x58, 0x58, 0x67, 0x67, 0x58, "buffer_atomic_umax_x2"),
   (0x59, 0x59, 0x68, 0x68, 0x59, "buffer_atomic_and_x2"),
   (0x5a, 0x5a, 0x69, 0x69, 0x5a, "buffer_atomic_or_x2"),
   (0x5b, 0x5b, 0x6a, 0x6a, 0x5b, "buffer_atomic_xor_x2"),
   (0x5c, 0x5c, 0x6b, 0x6b, 0x5c, "buffer_atomic_inc_x2"),
   (0x5d, 0x5d, 0x6c, 0x6c, 0x5d, "buffer_atomic_dec_x2"),
   (0x5e, 0x5e,   -1,   -1, 0x5e, "buffer_atomic_fcmpswap_x2"),
   (0x5f, 0x5f,   -1,   -1, 0x5f, "buffer_atomic_fmin_x2"),
   (0x60, 0x60,   -1,   -1, 0x60, "buffer_atomic_fmax_x2"),
   (  -1,   -1,   -1,   -1, 0x71, "buffer_gl0_inv"),
   (  -1,   -1,   -1,   -1, 0x72, "buffer_gl1_inv"),
   (  -1,   -1,   -1,   -1, 0x34, "buffer_atomic_csub"), #GFX10.3+. seems glc must be set
}
# Register each MUBUF instruction. Only the gfx7, gfx9 and gfx10 columns are
# forwarded to opcode(); an instruction is flagged atomic purely by having
# "atomic" in its name.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF:
    opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, InstrClass.VMem, is_atomic = "atomic" in name)

# MTBUF instructions:
#
# Each entry is (gfx6, gfx7, gfx8, gfx9, gfx10, name): the opcode number of the
# instruction on each hardware generation. -1 is used where no opcode number is
# given for a generation.
MTBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "tbuffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "tbuffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "tbuffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "tbuffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "tbuffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "tbuffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "tbuffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x08, "tbuffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x09, "tbuffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x0a, "tbuffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x0b, "tbuffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x0c, "tbuffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x0d, "tbuffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x0e, "tbuffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"),
}
# Register each MTBUF instruction. Only the gfx7, gfx9 and gfx10 columns are
# forwarded to opcode(); gfx6 and gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF:
    opcode(name, gfx7, gfx9, gfx10, Format.MTBUF, InstrClass.VMem)


# Non-sampling MIMG instructions. Each entry is (code, name); the same opcode
# number is registered for every hardware generation.
IMAGE = {
   (0x00, "image_load"),
   (0x01, "image_load_mip"),
   (0x02, "image_load_pck"),
   (0x03, "image_load_pck_sgn"),
   (0x04, "image_load_mip_pck"),
   (0x05, "image_load_mip_pck_sgn"),
   (0x08, "image_store"),
   (0x09, "image_store_mip"),
   (0x0a, "image_store_pck"),
   (0x0b, "image_store_mip_pck"),
   (0x0e, "image_get_resinfo"),
   (0x60, "image_get_lod"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
for (code, name) in IMAGE:
   opcode(name, code, code, code, Format.MIMG, InstrClass.VMem)

# Registered on its own because it only has a GFX10 opcode number.
opcode("image_msaa_load", -1, -1, 0x80, Format.MIMG, InstrClass.VMem) #GFX10.3+

# MIMG atomic instructions. Each entry is (gfx6, gfx7, gfx8/gfx9, name); the
# table has no separate GFX10 column. -1 is used where no opcode number is
# given for a generation.
IMAGE_ATOMIC = {
   (0x0f, 0x0f, 0x10, "image_atomic_swap"),
   (0x10, 0x10, 0x11, "image_atomic_cmpswap"),
   (0x11, 0x11, 0x12, "image_atomic_add"),
   (0x12, 0x12, 0x13, "image_atomic_sub"),
   (0x13,   -1,   -1, "image_atomic_rsub"),
   (0x14, 0x14, 0x14, "image_atomic_smin"),
   (0x15, 0x15, 0x15, "image_atomic_umin"),
   (0x16, 0x16, 0x16, "image_atomic_smax"),
   (0x17, 0x17, 0x17, "image_atomic_umax"),
   (0x18, 0x18, 0x18, "image_atomic_and"),
   (0x19, 0x19, 0x19, "image_atomic_or"),
   (0x1a, 0x1a, 0x1a, "image_atomic_xor"),
   (0x1b, 0x1b, 0x1b, "image_atomic_inc"),
   (0x1c, 0x1c, 0x1c, "image_atomic_dec"),
   (0x1d, 0x1d,   -1, "image_atomic_fcmpswap"),
   (0x1e, 0x1e,   -1, "image_atomic_fmin"),
   (0x1f, 0x1f,   -1, "image_atomic_fmax"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, gfx7, name)
# gfx7 and gfx10 opcodes are the same here, so the gfx7 value is passed for
# GFX10 as well. gfx6 is unpacked but not used.
for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC:
   opcode(name, gfx7, gfx89, gfx7, Format.MIMG, InstrClass.VMem, is_atomic = True)

# MIMG sample instructions. Each entry is (code, name); the same opcode number
# is registered for every hardware generation.
IMAGE_SAMPLE = {
   (0x20, "image_sample"),
   (0x21, "image_sample_cl"),
   (0x22, "image_sample_d"),
   (0x23, "image_sample_d_cl"),
   (0x24, "image_sample_l"),
   (0x25, "image_sample_b"),
   (0x26, "image_sample_b_cl"),
   (0x27, "image_sample_lz"),
   (0x28, "image_sample_c"),
   (0x29, "image_sample_c_cl"),
   (0x2a, "image_sample_c_d"),
   (0x2b, "image_sample_c_d_cl"),
   (0x2c, "image_sample_c_l"),
   (0x2d, "image_sample_c_b"),
   (0x2e, "image_sample_c_b_cl"),
   (0x2f, "image_sample_c_lz"),
   (0x30, "image_sample_o"),
   (0x31, "image_sample_cl_o"),
   (0x32, "image_sample_d_o"),
   (0x33, "image_sample_d_cl_o"),
   (0x34, "image_sample_l_o"),
   (0x35, "image_sample_b_o"),
   (0x36, "image_sample_b_cl_o"),
   (0x37, "image_sample_lz_o"),
   (0x38, "image_sample_c_o"),
   (0x39, "image_sample_c_cl_o"),
   (0x3a, "image_sample_c_d_o"),
   (0x3b, "image_sample_c_d_cl_o"),
   (0x3c, "image_sample_c_l_o"),
   (0x3d, "image_sample_c_b_o"),
   (0x3e, "image_sample_c_b_cl_o"),
   (0x3f, "image_sample_c_lz_o"),
   (0x68, "image_sample_cd"),
   (0x69, "image_sample_cd_cl"),
   (0x6a, "image_sample_c_cd"),
   (0x6b, "image_sample_c_cd_cl"),
   (0x6c, "image_sample_cd_o"),
   (0x6d, "image_sample_cd_cl_o"),
   (0x6e, "image_sample_c_cd_o"),
   (0x6f, "image_sample_c_cd_cl_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
for (code, name) in IMAGE_SAMPLE:
   opcode(name, code, code, code, Format.MIMG, InstrClass.VMem)

# MIMG gather4 instructions. Each entry is (code, name); the same opcode number
# is registered for every hardware generation. Commented-out entries are kept
# from the original table and are not registered.
IMAGE_GATHER4 = {
   (0x40, "image_gather4"),
   (0x41, "image_gather4_cl"),
   #(0x42, "image_gather4h"), VEGA only?
   (0x44, "image_gather4_l"), # following instructions have different opcodes according to ISA sheet.
   (0x45, "image_gather4_b"),
   (0x46, "image_gather4_b_cl"),
   (0x47, "image_gather4_lz"),
   (0x48, "image_gather4_c"),
   (0x49, "image_gather4_c_cl"), # previous instructions have different opcodes according to ISA sheet.
   #(0x4a, "image_gather4h_pck"), VEGA only?
   #(0x4b, "image_gather8h_pck"), VEGA only?
   (0x4c, "image_gather4_c_l"),
   (0x4d, "image_gather4_c_b"),
   (0x4e, "image_gather4_c_b_cl"),
   (0x4f, "image_gather4_c_lz"),
   (0x50, "image_gather4_o"),
   (0x51, "image_gather4_cl_o"),
   (0x54, "image_gather4_l_o"),
   (0x55, "image_gather4_b_o"),
   (0x56, "image_gather4_b_cl_o"),
   (0x57, "image_gather4_lz_o"),
   (0x58, "image_gather4_c_o"),
   (0x59, "image_gather4_c_cl_o"),
   (0x5c, "image_gather4_c_l_o"),
   (0x5d, "image_gather4_c_b_o"),
   (0x5e, "image_gather4_c_b_cl_o"),
   (0x5f, "image_gather4_c_lz_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
for (code, name) in IMAGE_GATHER4:
   opcode(name, code, code, code, Format.MIMG, InstrClass.VMem)

# Registered on its own because it only has a GFX10 opcode number (231 == 0xe7).
opcode("image_bvh64_intersect_ray", -1, -1, 231, Format.MIMG, InstrClass.VMem)

# FLAT instructions. Each entry is (gfx7, gfx8/gfx9, gfx10, name). -1 is used
# where no opcode number is given for a generation.
FLAT = {
   #GFX7, GFX8_9, GFX10
   (0x08, 0x10, 0x08, "flat_load_ubyte"),
   (0x09, 0x11, 0x09, "flat_load_sbyte"),
   (0x0a, 0x12, 0x0a, "flat_load_ushort"),
   (0x0b, 0x13, 0x0b, "flat_load_sshort"),
   (0x0c, 0x14, 0x0c, "flat_load_dword"),
   (0x0d, 0x15, 0x0d, "flat_load_dwordx2"),
   (0x0f, 0x16, 0x0f, "flat_load_dwordx3"),
   (0x0e, 0x17, 0x0e, "flat_load_dwordx4"),
   (0x18, 0x18, 0x18, "flat_store_byte"),
   (  -1, 0x19, 0x19, "flat_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, "flat_store_short"),
   (  -1, 0x1b, 0x1b, "flat_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, "flat_store_dword"),
   (0x1d, 0x1d, 0x1d, "flat_store_dwordx2"),
   (0x1f, 0x1e, 0x1f, "flat_store_dwordx3"),
   (0x1e, 0x1f, 0x1e, "flat_store_dwordx4"),
   (  -1, 0x20, 0x20, "flat_load_ubyte_d16"),
   (  -1, 0x21, 0x21, "flat_load_ubyte_d16_hi"),
   (  -1, 0x22, 0x22, "flat_load_sbyte_d16"),
   (  -1, 0x23, 0x23, "flat_load_sbyte_d16_hi"),
   (  -1, 0x24, 0x24, "flat_load_short_d16"),
   (  -1, 0x25, 0x25, "flat_load_short_d16_hi"),
   (0x30, 0x40, 0x30, "flat_atomic_swap"),
   (0x31, 0x41, 0x31, "flat_atomic_cmpswap"),
   (0x32, 0x42, 0x32, "flat_atomic_add"),
   (0x33, 0x43, 0x33, "flat_atomic_sub"),
   (0x35, 0x44, 0x35, "flat_atomic_smin"),
   (0x36, 0x45, 0x36, "flat_atomic_umin"),
   (0x37, 0x46, 0x37, "flat_atomic_smax"),
   (0x38, 0x47, 0x38, "flat_atomic_umax"),
   (0x39, 0x48, 0x39, "flat_atomic_and"),
   (0x3a, 0x49, 0x3a, "flat_atomic_or"),
   (0x3b, 0x4a, 0x3b, "flat_atomic_xor"),
   (0x3c, 0x4b, 0x3c, "flat_atomic_inc"),
   (0x3d, 0x4c, 0x3d, "flat_atomic_dec"),
   (0x3e,   -1, 0x3e, "flat_atomic_fcmpswap"),
   (0x3f,   -1, 0x3f, "flat_atomic_fmin"),
   (0x40,   -1, 0x40, "flat_atomic_fmax"),
   (0x50, 0x60, 0x50, "flat_atomic_swap_x2"),
   (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"),
   (0x52, 0x62, 0x52, "flat_atomic_add_x2"),
   (0x53, 0x63, 0x53, "flat_atomic_sub_x2"),
   (0x55, 0x64, 0x55, "flat_atomic_smin_x2"),
   (0x56, 0x65, 0x56, "flat_atomic_umin_x2"),
   (0x57, 0x66, 0x57, "flat_atomic_smax_x2"),
   (0x58, 0x67, 0x58, "flat_atomic_umax_x2"),
   (0x59, 0x68, 0x59, "flat_atomic_and_x2"),
   (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"),
   (0x5b, 0x6a, 0x5b, "flat_atomic_xor_x2"),
   (0x5c, 0x6b, 0x5c, "flat_atomic_inc_x2"),
   (0x5d, 0x6c, 0x5d, "flat_atomic_dec_x2"),
   (0x5e,   -1, 0x5e, "flat_atomic_fcmpswap_x2"),
   (0x5f,   -1, 0x5f, "flat_atomic_fmin_x2"),
   (0x60,   -1, 0x60, "flat_atomic_fmax_x2"),
}
# Register each FLAT instruction. The middle column (named gfx8 here) is the
# shared GFX8/GFX9 opcode number. An instruction is flagged atomic purely by
# having "atomic" in its name.
for (gfx7, gfx8, gfx10, name) in FLAT:
    opcode(name, gfx7, gfx8, gfx10, Format.FLAT, InstrClass.VMem, is_atomic = "atomic" in name) #TODO: also LDS?

# GLOBAL instructions. Each entry is (gfx8/gfx9, gfx10, name). -1 is used where
# no opcode number is given for a generation.
GLOBAL = {
   #GFX8_9, GFX10
   (0x10, 0x08, "global_load_ubyte"),
   (0x11, 0x09, "global_load_sbyte"),
   (0x12, 0x0a, "global_load_ushort"),
   (0x13, 0x0b, "global_load_sshort"),
   (0x14, 0x0c, "global_load_dword"),
   (0x15, 0x0d, "global_load_dwordx2"),
   (0x16, 0x0f, "global_load_dwordx3"),
   (0x17, 0x0e, "global_load_dwordx4"),
   (0x18, 0x18, "global_store_byte"),
   (0x19, 0x19, "global_store_byte_d16_hi"),
   (0x1a, 0x1a, "global_store_short"),
   (0x1b, 0x1b, "global_store_short_d16_hi"),
   (0x1c, 0x1c, "global_store_dword"),
   (0x1d, 0x1d, "global_store_dwordx2"),
   (0x1e, 0x1f, "global_store_dwordx3"),
   (0x1f, 0x1e, "global_store_dwordx4"),
   (0x20, 0x20, "global_load_ubyte_d16"),
   (0x21, 0x21, "global_load_ubyte_d16_hi"),
   (0x22, 0x22, "global_load_sbyte_d16"),
   (0x23, 0x23, "global_load_sbyte_d16_hi"),
   (0x24, 0x24, "global_load_short_d16"),
   (0x25, 0x25, "global_load_short_d16_hi"),
   (0x40, 0x30, "global_atomic_swap"),
   (0x41, 0x31, "global_atomic_cmpswap"),
   (0x42, 0x32, "global_atomic_add"),
   (0x43, 0x33, "global_atomic_sub"),
   (0x44, 0x35, "global_atomic_smin"),
   (0x45, 0x36, "global_atomic_umin"),
   (0x46, 0x37, "global_atomic_smax"),
   (0x47, 0x38, "global_atomic_umax"),
   (0x48, 0x39, "global_atomic_and"),
   (0x49, 0x3a, "global_atomic_or"),
   (0x4a, 0x3b, "global_atomic_xor"),
   (0x4b, 0x3c, "global_atomic_inc"),
   (0x4c, 0x3d, "global_atomic_dec"),
   (  -1, 0x3e, "global_atomic_fcmpswap"),
   (  -1, 0x3f, "global_atomic_fmin"),
   (  -1, 0x40, "global_atomic_fmax"),
   (0x60, 0x50, "global_atomic_swap_x2"),
   (0x61, 0x51, "global_atomic_cmpswap_x2"),
   (0x62, 0x52, "global_atomic_add_x2"),
   (0x63, 0x53, "global_atomic_sub_x2"),
   (0x64, 0x55, "global_atomic_smin_x2"),
   (0x65, 0x56, "global_atomic_umin_x2"),
   (0x66, 0x57, "global_atomic_smax_x2"),
   (0x67, 0x58, "global_atomic_umax_x2"),
   (0x68, 0x59, "global_atomic_and_x2"),
   (0x69, 0x5a, "global_atomic_or_x2"),
   (0x6a, 0x5b, "global_atomic_xor_x2"),
   (0x6b, 0x5c, "global_atomic_inc_x2"),
   (0x6c, 0x5d, "global_atomic_dec_x2"),
   (  -1, 0x5e, "global_atomic_fcmpswap_x2"),
   (  -1, 0x5f, "global_atomic_fmin_x2"),
   (  -1, 0x60, "global_atomic_fmax_x2"),
   (  -1, 0x16, "global_load_dword_addtid"), #GFX10.3+
   (  -1, 0x17, "global_store_dword_addtid"), #GFX10.3+
   (  -1, 0x34, "global_atomic_csub"), #GFX10.3+. seems glc must be set
}
# Register each GLOBAL instruction. -1 is always passed in the gfx7 position,
# and the first column (named gfx8 here) is the shared GFX8/GFX9 opcode
# number. An instruction is flagged atomic purely by having "atomic" in its name.
for (gfx8, gfx10, name) in GLOBAL:
    opcode(name, -1, gfx8, gfx10, Format.GLOBAL, InstrClass.VMem, is_atomic = "atomic" in name)

# SCRATCH instructions. Each entry is (gfx8/gfx9, gfx10, name).
SCRATCH = {
   #GFX8_9, GFX10
   (0x10, 0x08, "scratch_load_ubyte"),
   (0x11, 0x09, "scratch_load_sbyte"),
   (0x12, 0x0a, "scratch_load_ushort"),
   (0x13, 0x0b, "scratch_load_sshort"),
   (0x14, 0x0c, "scratch_load_dword"),
   (0x15, 0x0d, "scratch_load_dwordx2"),
   (0x16, 0x0f, "scratch_load_dwordx3"),
   (0x17, 0x0e, "scratch_load_dwordx4"),
   (0x18, 0x18, "scratch_store_byte"),
   (0x19, 0x19, "scratch_store_byte_d16_hi"),
   (0x1a, 0x1a, "scratch_store_short"),
   (0x1b, 0x1b, "scratch_store_short_d16_hi"),
   (0x1c, 0x1c, "scratch_store_dword"),
   (0x1d, 0x1d, "scratch_store_dwordx2"),
   (0x1e, 0x1f, "scratch_store_dwordx3"),
   (0x1f, 0x1e, "scratch_store_dwordx4"),
   (0x20, 0x20, "scratch_load_ubyte_d16"),
   (0x21, 0x21, "scratch_load_ubyte_d16_hi"),
   (0x22, 0x22, "scratch_load_sbyte_d16"),
   (0x23, 0x23, "scratch_load_sbyte_d16_hi"),
   (0x24, 0x24, "scratch_load_short_d16"),
   (0x25, 0x25, "scratch_load_short_d16_hi"),
}
# Register each SCRATCH instruction. -1 is always passed in the gfx7 position,
# and the first column (named gfx8 here) is the shared GFX8/GFX9 opcode number.
for (gfx8, gfx10, name) in SCRATCH:
    opcode(name, -1, gfx8, gfx10, Format.SCRATCH, InstrClass.VMem)

# check for duplicate opcode numbers
#
# For each checked generation, no two registered instructions of the same
# format may share an opcode number. On the first clash that is not one of the
# known exceptions below, a message is printed and the script exits with
# status 1.
for ver in ['gfx9', 'gfx10']:
    # Maps (format, opcode number) -> name of the first instruction seen with it.
    op_to_name = {}
    for op in opcodes.values():
        # Pseudo formats are excluded from the check.
        if op.format in [Format.PSEUDO, Format.PSEUDO_BRANCH, Format.PSEUDO_BARRIER, Format.PSEUDO_REDUCTION]:
            continue

        # -1 means the instruction has no opcode number on this generation.
        num = getattr(op, 'opcode_' + ver)
        if num == -1:
            continue

        key = (op.format, num)

        if key in op_to_name:
            # exceptions
            names = {op_to_name[key], op.name}
            # NOTE: 'gfx8' can never match while the outer loop only visits
            # 'gfx9' and 'gfx10'; it is kept so the exception still applies
            # if 'gfx8' is ever added to that list.
            if ver in ['gfx8', 'gfx9'] and names == {'v_mul_lo_i32', 'v_mul_lo_u32'}:
                continue
            # v_mad_legacy_f32 is replaced with v_fma_legacy_f32 on GFX10.3
            if ver == 'gfx10' and names == {'v_mad_legacy_f32', 'v_fma_legacy_f32'}:
                continue

            print('%s and %s share the same opcode number (%s)' % (op_to_name[key], op.name, ver))
            sys.exit(1)
        else:
            op_to_name[key] = op.name

# These instructions write the entire 32-bit VGPR, so their definition size
# must be 32. Opcode's constructor does not make that explicit -- the value
# only comes out right by accident -- so it is pinned here.
assert opcodes['ds_read_u8'].definition_size == 32
assert opcodes['ds_read_u16'].definition_size == 32