GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c
/*
 * Copyright (C) 2018 Jonathan Marek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}
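
/* e.g. (hypothetical allocation): if a value's logical x was allocated to
 * hardware component z and its logical y to w (comps[0].c == 2,
 * comps[1].c == 3), a source swizzle of .yx is remapped to .wz above.
 */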

/* ALU instructions need to take into account how the output components
 * are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non-per-component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}
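
/* e.g. (hypothetical allocation): if the instruction writes logical x and y
 * and those were allocated to hardware w and x (comp[0].c == 3,
 * comp[1].c == 0), the loop above makes hardware lane w read source lane 0
 * and hardware lane x read source lane 1, and swiz_merge() composes that
 * with the source's own swizzle. comp[j].c == 7 appears to mean "component
 * not allocated" and is skipped.
 */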

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}
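
/* e.g. (hypothetical allocation): a logical write mask of 0b0011 with
 * logical components 0 and 1 allocated to hardware z and w (comp[0].c == 2,
 * comp[1].c == 3) becomes a hardware write mask of 0b1100.
 */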

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}
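
/* note: this re-packs the per-lane component selects from the ALU-style
 * encoding used elsewhere (whatever swiz_get()/swiz_set() produce) into a
 * plain 2-bits-per-lane field, which is what the fetch instruction words
 * appear to expect.
 */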

static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}
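
/* note: the fetch dst swizzle is 3 bits per output channel. starting from
 * 0xfff (every field 7, which appears to mean "don't write the channel"),
 * each allocated component's field is replaced with the index of the fetch
 * result lane that should land in it.
 */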

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}
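
/* note: for GPR sources the "reg byte" is the register index with the
 * absolute-value modifier packed into bit 7; constant sources use the whole
 * byte as the constant index and are distinguished instead by the srcN_sel
 * bits set in fill_instr() below.
 */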

/* produce the 12-byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

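   /* note: a co-issued scalar instruction shares the same 12-byte ALU word;
    * its operand(s) are encoded in the src3_* fields, and for two-source
    * scalar ops the second component select is taken from
    * instr_s->alu.src1_swizzle (see alu_swizzle_scalar2()).
    */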
   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* remember the offset of the memory-export alloc CF for later patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}
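
/* note: two CF instructions pack into each 3-dword group, hence the
 * cf_idx / 2 * 3 dword offset above and the NOP padding to an even CF count
 * in assemble().
 */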

/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512 ?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* the vertex shader always needs to allocate at least one parameter;
    * if that would otherwise never happen (inputs_count == 0), do it here
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      {

         if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
         else
            block = ctx->instr_sched[j].instr_s->block_idx;

         assert(block_idx <= block);
      }

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

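      /* each instruction slot in the exec clause gets two serialize bits:
       * the low bit appears to mark a fetch instruction and the high bit
       * requests a sync when the instruction type changed (need_sync).
       */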
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }
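
   /* the final image is the CF block (num_cf / 2 groups of 3 dwords)
    * followed by the ALU/fetch instructions (3 dwords each); the num_cf / 2
    * bias added to the EXEC addresses above makes them count 3-dword slots
    * from the start of the whole image rather than from the start of the
    * ALU/fetch block.
    */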

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}