Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_assembler.cpp
4550 views
1
/*
2
* Copyright © 2018 Valve Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*
23
*/
24
25
#include "aco_builder.h"
26
#include "aco_ir.h"
27
28
#include "common/sid.h"
29
30
#include "util/memstream.h"
31
32
#include <algorithm>
33
#include <map>
34
#include <vector>
35
36
namespace aco {
37
38
/* Offsets recorded while emitting a p_constaddr pair (s_getpc_b64 followed by
 * an s_add_u32 with a literal). fix_constaddrs() patches the literal once the
 * final code size is known.
 */
struct constaddr_info {
   unsigned getpc_end;   /* code offset (dwords) just past the emitted s_getpc_b64 */
   unsigned add_literal; /* code offset (dwords) of the s_add_u32's literal dword */
};
42
43
/* State threaded through assembly: the per-generation opcode translation
 * table, branches whose offsets still need fixing, and p_constaddr sites to
 * patch in fix_constaddrs().
 */
struct asm_context {
   Program* program;
   enum chip_class chip_class;
   /* (code offset, branch instruction) pairs, filled while emitting SOPP */
   std::vector<std::pair<int, SOPP_instruction*>> branches;
   /* keyed by the p_constaddr id (the constant operand's value) */
   std::map<unsigned, constaddr_info> constaddrs;
   const int16_t* opcode; /* aco_opcode -> hw opcode table for this generation; -1 = unsupported */
   // TODO: keep track of branch instructions referring blocks
   // and, when emitting the block, correct the offset in instr
   asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
   {
      /* the three ranges below cover every chip_class value */
      if (chip_class <= GFX7)
         opcode = &instr_info.opcode_gfx7[0];
      else if (chip_class <= GFX9)
         opcode = &instr_info.opcode_gfx9[0];
      else if (chip_class >= GFX10)
         opcode = &instr_info.opcode_gfx10[0];
   }

   /* code offset of an open s_subvector_loop_begin, or -1 if none is open */
   int subvector_begin_pos = -1;
};
63
64
static uint32_t
65
get_sdwa_sel(unsigned sel, PhysReg reg)
66
{
67
if (sel & sdwa_isra) {
68
unsigned size = sdwa_rasize & sel;
69
if (size == 1)
70
return reg.byte();
71
else /* size == 2 */
72
return sdwa_isword | (reg.byte() >> 1);
73
}
74
return sel & sdwa_asuint;
75
}
76
77
/* Returns how many extra NSA (non-sequential address) dwords a MIMG
 * instruction needs. Address operands start at operands[3]; if they all sit
 * in consecutive VGPRs no NSA encoding is needed and 0 is returned, otherwise
 * one dword per four extra addresses.
 */
unsigned
get_mimg_nsa_dwords(const Instruction* instr)
{
   unsigned addr_dwords = instr->operands.size() - 3;
   for (unsigned i = 1; i < addr_dwords; i++) {
      if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
         return DIV_ROUND_UP(addr_dwords - 1, 4);
   }
   return 0;
}
87
88
/* Encode a single instruction into machine code, appending one or more dwords
 * to 'out'. Aborts the process on an opcode the target generation does not
 * support. DPP/SDWA recurse once to emit the base encoding first, then append
 * their extra dword.
 */
void
emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
{
   /* lower remaining pseudo-instructions */
   if (instr->opcode == aco_opcode::p_constaddr_getpc) {
      /* record the offset just past the s_getpc_b64 for fix_constaddrs() */
      ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;

      instr->opcode = aco_opcode::s_getpc_b64;
      instr->operands.pop_back();
   } else if (instr->opcode == aco_opcode::p_constaddr_addlo) {
      /* record where the literal dword will land; patched later */
      ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;

      instr->opcode = aco_opcode::s_add_u32;
      instr->operands[1] = Operand::zero();
      instr->operands[1].setFixed(PhysReg(255)); /* 255 = literal constant source */
   }

   uint32_t opcode = ctx.opcode[(int)instr->opcode];
   if (opcode == (uint32_t)-1) {
      /* format the offending instruction into a buffer for the error message */
      char* outmem;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &outmem, &outsize);
      FILE* const memf = u_memstream_get(&mem);

      fprintf(memf, "Unsupported opcode: ");
      aco_print_instr(instr, memf);
      u_memstream_close(&mem);

      aco_err(ctx.program, outmem);
      free(outmem);

      abort();
   }

   switch (instr->format) {
   case Format::SOP2: {
      uint32_t encoding = (0b10 << 30);
      encoding |= opcode << 23;
      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
      encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPK: {
      SOPK_instruction& sopk = instr->sopk();

      if (instr->opcode == aco_opcode::s_subvector_loop_begin) {
         assert(ctx.chip_class >= GFX10);
         assert(ctx.subvector_begin_pos == -1);
         /* defer the imm until the matching s_subvector_loop_end is seen */
         ctx.subvector_begin_pos = out.size();
      } else if (instr->opcode == aco_opcode::s_subvector_loop_end) {
         assert(ctx.chip_class >= GFX10);
         assert(ctx.subvector_begin_pos != -1);
         /* Adjust s_subvector_loop_begin instruction to the address after the end  */
         out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos);
         /* Adjust s_subvector_loop_end instruction to the address after the beginning */
         sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size());
         ctx.subvector_begin_pos = -1;
      }

      uint32_t encoding = (0b1011 << 28);
      encoding |= opcode << 23;
      /* SDST field: the definition unless it's SCC, else a low-SGPR operand */
      encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
                     ? instr->definitions[0].physReg() << 16
                     : !instr->operands.empty() && instr->operands[0].physReg() <= 127
                          ? instr->operands[0].physReg() << 16
                          : 0;
      encoding |= sopk.imm;
      out.push_back(encoding);
      break;
   }
   case Format::SOP1: {
      uint32_t encoding = (0b101111101 << 23);
      if (opcode >= 55 && ctx.chip_class <= GFX9) {
         /* GFX9 packs these opcodes 4 lower than the shared table assumes */
         assert(ctx.chip_class == GFX9 && opcode < 60);
         opcode = opcode - 4;
      }
      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
      encoding |= opcode << 8;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPC: {
      uint32_t encoding = (0b101111110 << 23);
      encoding |= opcode << 16;
      encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPP: {
      SOPP_instruction& sopp = instr->sopp();
      uint32_t encoding = (0b101111111 << 23);
      encoding |= opcode << 16;
      encoding |= (uint16_t)sopp.imm;
      if (sopp.block != -1) {
         /* branch target offset is resolved later in fix_branches() */
         sopp.pass_flags = 0;
         ctx.branches.emplace_back(out.size(), &sopp);
      }
      out.push_back(encoding);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      /* SOE: an extra SGPR offset operand is present past data/addr operands */
      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
      bool is_load = !instr->definitions.empty();
      uint32_t encoding = 0;

      if (ctx.chip_class <= GFX7) {
         /* GFX7 and older use the single-dword SMRD encoding */
         encoding = (0b11000 << 27);
         encoding |= opcode << 22;
         encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0;
         encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;
         if (instr->operands.size() >= 2) {
            if (!instr->operands[1].isConstant()) {
               encoding |= instr->operands[1].physReg().reg();
            } else if (instr->operands[1].constantValue() >= 1024) {
               encoding |= 255; /* SQ_SRC_LITERAL */
            } else {
               encoding |= instr->operands[1].constantValue() >> 2;
               encoding |= 1 << 8; /* IMM bit */
            }
         }
         out.push_back(encoding);
         /* SMRD instructions can take a literal on GFX7 */
         if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
             instr->operands[1].constantValue() >= 1024)
            out.push_back(instr->operands[1].constantValue() >> 2);
         return;
      }

      if (ctx.chip_class <= GFX9) {
         encoding = (0b110000 << 26);
         assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         encoding |= smem.nv ? 1 << 15 : 0;
      } else {
         encoding = (0b111101 << 26);
         assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
         encoding |= smem.dlc ? 1 << 14 : 0;
      }

      encoding |= opcode << 18;
      encoding |= smem.glc ? 1 << 16 : 0;

      if (ctx.chip_class <= GFX9) {
         if (instr->operands.size() >= 2)
            encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
      }
      if (ctx.chip_class == GFX9) {
         encoding |= soe ? 1 << 14 : 0;
      }

      if (is_load || instr->operands.size() >= 3) { /* SDATA */
         encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
                     << 6;
      }
      if (instr->operands.size() >= 1) { /* SBASE */
         encoding |= instr->operands[0].physReg() >> 1;
      }

      out.push_back(encoding);
      encoding = 0;

      int32_t offset = 0;
      uint32_t soffset = ctx.chip_class >= GFX10
                            ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
                            : 0;        /* On GFX9, it is disabled by the SOE bit (and it's not present on
                                           GFX8 and below) */
      if (instr->operands.size() >= 2) {
         const Operand& op_off1 = instr->operands[1];
         if (ctx.chip_class <= GFX9) {
            offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
         } else {
            /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
             * SGPR */
            if (op_off1.isConstant()) {
               offset = op_off1.constantValue();
            } else {
               soffset = op_off1.physReg();
               assert(!soe); /* There is no place to put the other SGPR offset, if any */
            }
         }

         if (soe) {
            const Operand& op_off2 = instr->operands.back();
            assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
                                               and an SGPR at the same time */
            assert(!op_off2.isConstant());
            soffset = op_off2.physReg();
         }
      }
      encoding |= offset;
      encoding |= soffset << 25;

      out.push_back(encoding);
      return;
   }
   case Format::VOP2: {
      uint32_t encoding = 0;
      encoding |= opcode << 25;
      encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
      encoding |= (0xFF & instr->operands[1].physReg()) << 9;
      encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VOP1: {
      uint32_t encoding = (0b0111111 << 25);
      if (!instr->definitions.empty())
         encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
      encoding |= opcode << 9;
      if (!instr->operands.empty())
         encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VOPC: {
      uint32_t encoding = (0b0111110 << 25);
      encoding |= opcode << 17;
      encoding |= (0xFF & instr->operands[1].physReg()) << 9;
      encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VINTRP: {
      Interp_instruction& interp = instr->vintrp();
      uint32_t encoding = 0;

      if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
          instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
          instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
          instr->opcode == aco_opcode::v_interp_p2_f16) {
         /* f16 interpolation is encoded as a two-dword VOP3-like instruction */
         if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
            encoding = (0b110100 << 26);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110101 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         encoding |= opcode << 16;
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);

         encoding = 0;
         encoding |= interp.attribute;
         encoding |= interp.component << 6;
         encoding |= instr->operands[0].physReg() << 9;
         if (instr->opcode == aco_opcode::v_interp_p2_f16 ||
             instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
             instr->opcode == aco_opcode::v_interp_p1lv_f16) {
            encoding |= instr->operands[2].physReg() << 18;
         }
         out.push_back(encoding);
      } else {
         if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
            encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
         } else {
            encoding = (0b110010 << 26);
         }

         assert(encoding);
         encoding |= (0xFF & instr->definitions[0].physReg()) << 18;
         encoding |= opcode << 16;
         encoding |= interp.attribute << 10;
         encoding |= interp.component << 8;
         if (instr->opcode == aco_opcode::v_interp_mov_f32)
            encoding |= (0x3 & instr->operands[0].constantValue());
         else
            encoding |= (0xFF & instr->operands[0].physReg());
         out.push_back(encoding);
      }
      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      uint32_t encoding = (0b110110 << 26);
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding |= opcode << 17;
         encoding |= (ds.gds ? 1 : 0) << 16;
      } else {
         encoding |= opcode << 18;
         encoding |= (ds.gds ? 1 : 0) << 17;
      }
      encoding |= ((0xFF & ds.offset1) << 8);
      encoding |= (0xFFFF & ds.offset0);
      out.push_back(encoding);
      encoding = 0;
      unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
      encoding |= (0xFF & reg) << 24; /* VDST */
      /* m0 operands are implicit and encoded as 0 */
      reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
               ? instr->operands[2].physReg()
               : 0;
      encoding |= (0xFF & reg) << 16; /* DATA1 */
      reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
               ? instr->operands[1].physReg()
               : 0;
      encoding |= (0xFF & reg) << 8; /* DATA0 */
      encoding |= (0xFF & instr->operands[0].physReg()); /* ADDR */
      out.push_back(encoding);
      break;
   }
   case Format::MUBUF: {
      MUBUF_instruction& mubuf = instr->mubuf();
      uint32_t encoding = (0b111000 << 26);
      encoding |= opcode << 18;
      encoding |= (mubuf.lds ? 1 : 0) << 16;
      encoding |= (mubuf.glc ? 1 : 0) << 14;
      encoding |= (mubuf.idxen ? 1 : 0) << 13;
      assert(!mubuf.addr64 || ctx.chip_class <= GFX7);
      if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7)
         encoding |= (mubuf.addr64 ? 1 : 0) << 15;
      encoding |= (mubuf.offen ? 1 : 0) << 12;
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         encoding |= (mubuf.slc ? 1 : 0) << 17;
      } else if (ctx.chip_class >= GFX10) {
         encoding |= (mubuf.dlc ? 1 : 0) << 15;
      }
      encoding |= 0x0FFF & mubuf.offset;
      out.push_back(encoding);
      encoding = 0;
      if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) {
         /* SLC sits in the second dword on these generations */
         encoding |= (mubuf.slc ? 1 : 0) << 22;
      }
      encoding |= instr->operands[2].physReg() << 24; /* SOFFSET */
      encoding |= (mubuf.tfe ? 1 : 0) << 23;
      encoding |= (instr->operands[0].physReg() >> 2) << 16; /* SRSRC */
      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
                                                : instr->definitions[0].physReg();
      encoding |= (0xFF & reg) << 8; /* VDATA */
      encoding |= (0xFF & instr->operands[1].physReg()); /* VADDR */
      out.push_back(encoding);
      break;
   }
   case Format::MTBUF: {
      MTBUF_instruction& mtbuf = instr->mtbuf();

      uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf.dfmt, mtbuf.nfmt);
      uint32_t encoding = (0b111010 << 26);
      assert(img_format <= 0x7F);
      assert(!mtbuf.dlc || ctx.chip_class >= GFX10);
      encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
      encoding |= (mtbuf.glc ? 1 : 0) << 14;
      encoding |= (mtbuf.idxen ? 1 : 0) << 13;
      encoding |= (mtbuf.offen ? 1 : 0) << 12;
      encoding |= 0x0FFF & mtbuf.offset;
      encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */

      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding |= opcode << 15;
      } else {
         encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
      }

      out.push_back(encoding);
      encoding = 0;

      encoding |= instr->operands[2].physReg() << 24; /* SOFFSET */
      encoding |= (mtbuf.tfe ? 1 : 0) << 23;
      encoding |= (mtbuf.slc ? 1 : 0) << 22;
      encoding |= (instr->operands[0].physReg() >> 2) << 16; /* SRSRC */
      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
                                                : instr->definitions[0].physReg();
      encoding |= (0xFF & reg) << 8; /* VDATA */
      encoding |= (0xFF & instr->operands[1].physReg()); /* VADDR */

      if (ctx.chip_class >= GFX10) {
         encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */
      }

      out.push_back(encoding);
      break;
   }
   case Format::MIMG: {
      unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
      assert(!nsa_dwords || ctx.chip_class >= GFX10);

      MIMG_instruction& mimg = instr->mimg();
      uint32_t encoding = (0b111100 << 26);
      encoding |= mimg.slc ? 1 << 25 : 0;
      encoding |= (opcode & 0x7f) << 18;
      encoding |= (opcode >> 7) & 1; /* opcode MSB lands in bit 0 */
      encoding |= mimg.lwe ? 1 << 17 : 0;
      encoding |= mimg.tfe ? 1 << 16 : 0;
      encoding |= mimg.glc ? 1 << 13 : 0;
      encoding |= mimg.unrm ? 1 << 12 : 0;
      if (ctx.chip_class <= GFX9) {
         assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         assert(!mimg.r128);
         encoding |= mimg.a16 ? 1 << 15 : 0;
         encoding |= mimg.da ? 1 << 14 : 0;
      } else {
         encoding |= mimg.r128 ? 1 << 15
                               : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
         encoding |= nsa_dwords << 1;
         encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
         encoding |= mimg.dlc ? 1 << 7 : 0;
      }
      encoding |= (0xF & mimg.dmask) << 8;
      out.push_back(encoding);
      encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */
      if (!instr->definitions.empty()) {
         encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */
      } else if (!instr->operands[2].isUndefined()) {
         encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */
      }
      encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */
      if (!instr->operands[1].isUndefined())
         encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */

      assert(!mimg.d16 || ctx.chip_class >= GFX9);
      encoding |= mimg.d16 ? 1 << 31 : 0;
      if (ctx.chip_class >= GFX10) {
         /* GFX10: A16 still exists, but is in a different place */
         encoding |= mimg.a16 ? 1 << 30 : 0;
      }

      out.push_back(encoding);

      if (nsa_dwords) {
         /* pack the non-sequential VGPR addresses, four per dword */
         out.resize(out.size() + nsa_dwords);
         std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
         for (unsigned i = 0; i < instr->operands.size() - 4u; i++)
            nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
      }
      break;
   }
   case Format::FLAT:
   case Format::SCRATCH:
   case Format::GLOBAL: {
      FLAT_instruction& flat = instr->flatlike();
      uint32_t encoding = (0b110111 << 26);
      encoding |= opcode << 18;
      if (ctx.chip_class <= GFX9) {
         assert(flat.offset <= 0x1fff);
         encoding |= flat.offset & 0x1fff;
      } else if (instr->isFlat()) {
         /* GFX10 has a 12-bit immediate OFFSET field,
          * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug
          */
         assert(flat.offset == 0);
      } else {
         assert(flat.offset <= 0xfff);
         encoding |= flat.offset & 0xfff;
      }
      /* SEG field: 0 = flat, 1 = scratch, 2 = global */
      if (instr->isScratch())
         encoding |= 1 << 14;
      else if (instr->isGlobal())
         encoding |= 2 << 14;
      encoding |= flat.lds ? 1 << 13 : 0;
      encoding |= flat.glc ? 1 << 16 : 0;
      encoding |= flat.slc ? 1 << 17 : 0;
      if (ctx.chip_class >= GFX10) {
         assert(!flat.nv);
         encoding |= flat.dlc ? 1 << 12 : 0;
      } else {
         assert(!flat.dlc);
      }
      out.push_back(encoding);
      encoding = (0xFF & instr->operands[0].physReg()); /* ADDR */
      if (!instr->definitions.empty())
         encoding |= (0xFF & instr->definitions[0].physReg()) << 24; /* VDST */
      if (instr->operands.size() >= 3)
         encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* DATA */
      if (!instr->operands[1].isUndefined()) {
         assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
         assert(instr->format != Format::FLAT);
         encoding |= instr->operands[1].physReg() << 16; /* SADDR */
      } else if (instr->format != Format::FLAT ||
                 ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
         /* "no SADDR" sentinel differs between generations */
         if (ctx.chip_class <= GFX9)
            encoding |= 0x7F << 16;
         else
            encoding |= sgpr_null << 16;
      }
      encoding |= flat.nv ? 1 << 23 : 0;
      out.push_back(encoding);
      break;
   }
   case Format::EXP: {
      Export_instruction& exp = instr->exp();
      uint32_t encoding;
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding = (0b110001 << 26);
      } else {
         encoding = (0b111110 << 26);
      }

      encoding |= exp.valid_mask ? 0b1 << 12 : 0;
      encoding |= exp.done ? 0b1 << 11 : 0;
      encoding |= exp.compressed ? 0b1 << 10 : 0;
      encoding |= exp.dest << 4;
      encoding |= exp.enabled_mask;
      out.push_back(encoding);
      encoding = 0xFF & exp.operands[0].physReg();
      encoding |= (0xFF & exp.operands[1].physReg()) << 8;
      encoding |= (0xFF & exp.operands[2].physReg()) << 16;
      encoding |= (0xFF & exp.operands[3].physReg()) << 24;
      out.push_back(encoding);
      break;
   }
   case Format::PSEUDO:
   case Format::PSEUDO_BARRIER:
      /* p_unit_test is deliberately allowed through as a no-op marker */
      if (instr->opcode != aco_opcode::p_unit_test)
         unreachable("Pseudo instructions should be lowered before assembly.");
      break;
   default:
      if (instr->isVOP3()) {
         VOP3_instruction& vop3 = instr->vop3();

         /* VOP3 shares the opcode space with the base encodings; translate
          * the base opcode into its VOP3 range */
         if (instr->isVOP2()) {
            opcode = opcode + 0x100;
         } else if (instr->isVOP1()) {
            if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9)
               opcode = opcode + 0x140;
            else
               opcode = opcode + 0x180;
         } else if (instr->isVOPC()) {
            opcode = opcode + 0x0;
         } else if (instr->isVINTRP()) {
            opcode = opcode + 0x270;
         }

         uint32_t encoding;
         if (ctx.chip_class <= GFX9) {
            encoding = (0b110100 << 26);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110101 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         if (ctx.chip_class <= GFX7) {
            encoding |= opcode << 17;
            encoding |= (vop3.clamp ? 1 : 0) << 11;
         } else {
            encoding |= opcode << 16;
            encoding |= (vop3.clamp ? 1 : 0) << 15;
         }
         encoding |= vop3.opsel << 11;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.abs[i] << (8 + i);
         if (instr->definitions.size() == 2)
            encoding |= instr->definitions[1].physReg() << 8; /* SDST (carry etc.) */
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);
         encoding = 0;
         if (instr->opcode == aco_opcode::v_interp_mov_f32) {
            encoding = 0x3 & instr->operands[0].constantValue();
         } else {
            /* three 9-bit source fields */
            for (unsigned i = 0; i < instr->operands.size(); i++)
               encoding |= instr->operands[i].physReg() << (i * 9);
         }
         encoding |= vop3.omod << 27;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg[i] << (29 + i);
         out.push_back(encoding);

      } else if (instr->isVOP3P()) {
         VOP3P_instruction& vop3 = instr->vop3p();

         uint32_t encoding;
         if (ctx.chip_class == GFX9) {
            encoding = (0b110100111 << 23);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110011 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         encoding |= opcode << 16;
         encoding |= (vop3.clamp ? 1 : 0) << 15;
         encoding |= vop3.opsel_lo << 11;
         encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14; /* opsel_hi[2] is split off */
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg_hi[i] << (8 + i);
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);
         encoding = 0;
         for (unsigned i = 0; i < instr->operands.size(); i++)
            encoding |= instr->operands[i].physReg() << (i * 9);
         encoding |= (vop3.opsel_hi & 0x3) << 27;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg_lo[i] << (29 + i);
         out.push_back(encoding);

      } else if (instr->isDPP()) {
         assert(ctx.chip_class >= GFX8);
         DPP_instruction& dpp = instr->dpp();

         /* first emit the instruction without the DPP operand */
         Operand dpp_op = instr->operands[0];
         instr->operands[0] = Operand(PhysReg{250}, v1); /* 250 = DPP sentinel source */
         instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
         emit_instruction(ctx, out, instr);
         uint32_t encoding = (0xF & dpp.row_mask) << 28;
         encoding |= (0xF & dpp.bank_mask) << 24;
         encoding |= dpp.abs[1] << 23;
         encoding |= dpp.neg[1] << 22;
         encoding |= dpp.abs[0] << 21;
         encoding |= dpp.neg[0] << 20;
         if (ctx.chip_class >= GFX10)
            encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
         encoding |= dpp.bound_ctrl << 19;
         encoding |= dpp.dpp_ctrl << 8;
         encoding |= (0xFF) & dpp_op.physReg();
         out.push_back(encoding);
         return;
      } else if (instr->isSDWA()) {
         SDWA_instruction& sdwa = instr->sdwa();

         /* first emit the instruction without the SDWA operand */
         Operand sdwa_op = instr->operands[0];
         instr->operands[0] = Operand(PhysReg{249}, v1); /* 249 = SDWA sentinel source */
         instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
         emit_instruction(ctx, out, instr);

         uint32_t encoding = 0;

         if (instr->isVOPC()) {
            if (instr->definitions[0].physReg() != vcc) {
               encoding |= instr->definitions[0].physReg() << 8;
               encoding |= 1 << 15; /* SD: destination is an explicit SGPR, not VCC */
            }
            encoding |= (sdwa.clamp ? 1 : 0) << 13;
         } else {
            encoding |= get_sdwa_sel(sdwa.dst_sel, instr->definitions[0].physReg()) << 8;
            uint32_t dst_u = sdwa.dst_sel & sdwa_sext ? 1 : 0;
            if (sdwa.dst_preserve || (sdwa.dst_sel & sdwa_isra))
               dst_u = 2; /* DST_UNUSED = preserve */
            encoding |= dst_u << 11;
            encoding |= (sdwa.clamp ? 1 : 0) << 13;
            encoding |= sdwa.omod << 14;
         }

         encoding |= get_sdwa_sel(sdwa.sel[0], sdwa_op.physReg()) << 16;
         encoding |= sdwa.sel[0] & sdwa_sext ? 1 << 19 : 0;
         encoding |= sdwa.abs[0] << 21;
         encoding |= sdwa.neg[0] << 20;

         if (instr->operands.size() >= 2) {
            encoding |= get_sdwa_sel(sdwa.sel[1], instr->operands[1].physReg()) << 24;
            encoding |= sdwa.sel[1] & sdwa_sext ? 1 << 27 : 0;
            encoding |= sdwa.abs[1] << 29;
            encoding |= sdwa.neg[1] << 28;
         }

         encoding |= 0xFF & sdwa_op.physReg();
         encoding |= (sdwa_op.physReg() < 256) << 23; /* S0: source is an SGPR */
         if (instr->operands.size() >= 2)
            encoding |= (instr->operands[1].physReg() < 256) << 31; /* S1 */
         out.push_back(encoding);
      } else {
         unreachable("unimplemented instruction format");
      }
      break;
   }

   /* append literal dword */
   for (const Operand& op : instr->operands) {
      if (op.isLiteral()) {
         out.push_back(op.constantValue());
         break;
      }
   }
}
758
759
/* Assemble all instructions of one block, appending encoded dwords to 'out'.
 * The #if 0 sections are debug scaffolding that dumps each instruction and
 * its encoding to stderr when enabled.
 */
void
emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
{
   for (aco_ptr<Instruction>& instr : block.instructions) {
#if 0
      int start_idx = out.size();
      std::cerr << "Encoding:\t" << std::endl;
      aco_print_instr(&*instr, stderr);
      std::cerr << std::endl;
#endif
      emit_instruction(ctx, out, instr.get());
#if 0
      for (int i = start_idx; i < out.size(); i++)
         std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
#endif
   }
}
776
777
/* Ensure the shader's final export is marked 'done' (and 'valid_mask' for
 * non-position exports) so the hardware knows the wave has finished
 * exporting. Scans each export-end block backwards for the last export;
 * aborts if none is found, since a missing export would hang the GPU.
 */
void
fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
{
   bool exported = false;
   for (Block& block : program->blocks) {
      if (!(block.kind & block_kind_export_end))
         continue;
      std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
      while (it != block.instructions.rend()) {
         if ((*it)->isEXP()) {
            Export_instruction& exp = (*it)->exp();
            if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
               /* VS/NGG: only a position export can be the final one */
               if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) {
                  exp.done = true;
                  exported = true;
                  break;
               }
            } else {
               exp.done = true;
               exp.valid_mask = true;
               exported = true;
               break;
            }
         } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
            /* stop at an exec write: exports before it ran under a different mask */
            break;
         ++it;
      }
   }

   if (!exported) {
      /* Abort in order to avoid a GPU hang. */
      bool is_vertex_or_ngg =
         (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
      aco_err(program,
              "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
      aco_print_program(program, stderr);
      abort();
   }
}
816
817
/* Insert 'insert_count' dwords of code at offset 'insert_before', then shift
 * every recorded offset (block starts, pending branches, p_constaddr
 * locations) that lies at or after the insertion point.
 */
static void
insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
            unsigned insert_count, const uint32_t* insert_data)
{
   out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);

   /* Update the offset of each affected block */
   for (Block& block : ctx.program->blocks) {
      if (block.offset >= insert_before)
         block.offset += insert_count;
   }

   /* Find first branch after the inserted code */
   auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
                                 [insert_before](const auto& branch) -> bool
                                 { return (unsigned)branch.first >= insert_before; });

   /* Update the locations of branches */
   for (; branch_it != ctx.branches.end(); ++branch_it)
      branch_it->first += insert_count;

   /* Update the locations of p_constaddr instructions */
   for (auto& constaddr : ctx.constaddrs) {
      constaddr_info& info = constaddr.second;
      if (info.getpc_end >= insert_before)
         info.getpc_end += insert_count;
      if (info.add_literal >= insert_before)
         info.add_literal += insert_count;
   }
}
847
848
/* Work around a GFX10 hardware bug: a branch whose encoded offset equals
 * 0x3f misbehaves. Whenever such a branch exists, an s_nop is inserted right
 * after it, which shifts the offset; repeat until no branch hits 0x3f
 * (insertions can create new offenders).
 */
static void
fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
{
   /* Branches with an offset of 0x3f are buggy on GFX10,
    * we workaround by inserting NOPs if needed.
    */
   bool gfx10_3f_bug = false;

   do {
      auto buggy_branch_it = std::find_if(
         ctx.branches.begin(), ctx.branches.end(),
         [&ctx](const auto& branch) -> bool {
            return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
                   0x3f;
         });

      gfx10_3f_bug = buggy_branch_it != ctx.branches.end();

      if (gfx10_3f_bug) {
         /* Insert an s_nop after the branch */
         constexpr uint32_t s_nop_0 = 0xbf800000u;
         insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);
      }
   } while (gfx10_3f_bug);
}
873
874
/* Emit the instruction sequence that replaces a branch whose target is out
 * of SOPP's 16-bit range: compute an absolute target address from the
 * current PC and jump via s_setpc_b64. SCC is temporarily stashed in the
 * LSB of the new PC (the PC is instruction-aligned, so the LSB is free) and
 * restored afterwards. For conditional branches, an inverted short branch
 * first skips the whole sequence when the condition is false.
 * The literal added to PC-lo is patched later in fix_branches(), which uses
 * branch->pass_flags (set here to the offset just past that literal).
 */
void
emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
               std::vector<uint32_t>& out)
{
   Builder bld(ctx.program);

   Definition def_tmp_lo(branch->definitions[0].physReg(), s1);
   Operand op_tmp_lo(branch->definitions[0].physReg(), s1);
   Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
   Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);

   aco_ptr<Instruction> instr;

   if (branch->opcode != aco_opcode::s_branch) {
      /* for conditional branches, skip the long jump if the condition is false */
      aco_opcode inv;
      switch (branch->opcode) {
      case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
      case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
      case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
      case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
      case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
      case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
      default: unreachable("Unhandled long jump.");
      }
      /* imm=7 skips the 7 dwords of the long-jump sequence below */
      instr.reset(bld.sopp(inv, -1, 7));
      emit_instruction(ctx, out, instr.get());
   }

   /* create the new PC and stash SCC in the LSB */
   instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);
   emit_instruction(ctx, out, instr.get());

   instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
   instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */
   emit_instruction(ctx, out, instr.get());
   branch->pass_flags = out.size();

   instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi,
                        Operand::c32(backwards ? UINT32_MAX : 0u))
                  .instr);
   emit_instruction(ctx, out, instr.get());

   /* restore SCC and clear the LSB of the new PC */
   instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
   emit_instruction(ctx, out, instr.get());
   instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr);
   emit_instruction(ctx, out, instr.get());

   /* create the s_setpc_b64 to jump */
   instr.reset(
      bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
   emit_instruction(ctx, out, instr.get());
}
928
929
/* Resolve all recorded branch offsets. Branches whose target fits in the
 * 16-bit SOPP immediate are patched in place; out-of-range ones are rewritten
 * into a long-jump sequence (which shifts code, so the whole pass restarts).
 * pass_flags != 0 marks a branch already converted to a long jump; for those,
 * the PC-relative byte distance is written over the s_add literal instead.
 */
void
fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
{
   bool repeat = false;
   do {
      repeat = false;

      if (ctx.chip_class == GFX10)
         fix_branches_gfx10(ctx, out);

      for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
         /* offset is in dwords, relative to the instruction after the branch */
         int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
         if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
            std::vector<uint32_t> long_jump;
            bool backwards =
               ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
            emit_long_jump(ctx, branch.second, backwards, long_jump);

            /* overwrite the branch dword, then splice in the rest of the sequence */
            out[branch.first] = long_jump[0];
            insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);

            repeat = true;
            break; /* offsets moved; restart the scan */
         }

         if (branch.second->pass_flags) {
            /* long jump: patch the literal with the byte distance from the
             * instruction after the s_getpc_b64 to the target block */
            int after_getpc = branch.first + branch.second->pass_flags - 2;
            offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;
            out[branch.first + branch.second->pass_flags - 1] = offset * 4;
         } else {
            /* short branch: rewrite the 16-bit immediate */
            out[branch.first] &= 0xffff0000u;
            out[branch.first] |= (uint16_t)offset;
         }
      }
   } while (repeat);
}
965
966
void
967
fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
968
{
969
for (auto& constaddr : ctx.constaddrs) {
970
constaddr_info& info = constaddr.second;
971
out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
972
}
973
}
974
975
/* Assemble the whole program into 'code'. Returns the executable size in
 * bytes (excluding the GFX10 prefetch padding and the appended constant
 * data).
 */
unsigned
emit_program(Program* program, std::vector<uint32_t>& code)
{
   asm_context ctx(program);

   /* stages that export must have their final export marked 'done' */
   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
       program->stage.hw == HWStage::NGG)
      fix_exports(ctx, code, program);

   for (Block& block : program->blocks) {
      block.offset = code.size();
      emit_block(ctx, code, block);
   }

   fix_branches(ctx, code);

   unsigned exec_size = code.size() * sizeof(uint32_t);

   if (program->chip_class >= GFX10) {
      /* Pad output with s_code_end so instruction prefetching doesn't cause
       * page faults */
      unsigned final_size = align(code.size() + 3 * 16, 16);
      while (code.size() < final_size)
         code.push_back(0xbf9f0000u); /* s_code_end */
   }

   /* must run after all code motion so offsets are final */
   fix_constaddrs(ctx, code);

   /* pad constant data to a dword multiple before copying */
   while (program->constant_data.size() % 4u)
      program->constant_data.push_back(0);
   /* Copy constant data */
   code.insert(code.end(), (uint32_t*)program->constant_data.data(),
               (uint32_t*)(program->constant_data.data() + program->constant_data.size()));

   return exec_size;
}
1011
1012
} // namespace aco
1013
1014