GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_lower_parallelcopy.c
/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

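/* This pass lowers the copy-related meta instructions left after register
 * assignment (parallel copy, collect, and split) into actual moves and swaps;
 * phi metas are simply removed. Each meta instruction is flattened into a
 * list of per-physreg copies, and _handle_copies() then emits the copies that
 * aren't blocked, splits 32-bit copies that are blocked on only one 16-bit
 * half, and finally breaks the remaining cycles with swaps. A single pending
 * copy is described by copy_entry/copy_src below.
 */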
struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

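/* Size of a copy in physreg slots: a half (16-bit) register occupies one
 * slot and a full (32-bit) register occupies two.
 */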
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

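/* Emit a single xor.b before instr. The pre-a5xx path in do_swap() chains
 * three of these to swap two registers without a temporary, i.e. the classic
 * xor trick:
 *
 *    a ^= b;
 *    b ^= a;
 *    a ^= b;
 */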
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction * xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

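/* Swap the values in entry->dst and entry->src.reg in place, emitting the
 * swap sequence before instr. The source must be a plain register, never an
 * immediate or constant.
 */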
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = entry->dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gpu_id < 500) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

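/* Emit a copy of entry->src (register, immediate, or constant) into
 * entry->dst, placed before instr.
 */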
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = entry->src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, h(16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

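/* A copy is blocked while any physreg of its destination is still needed as
 * the source of a pending copy; writing the destination now would clobber
 * that not-yet-copied value.
 */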
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

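/* Split a full (32-bit) register-to-register copy into two 16-bit copies, so
 * that a half whose destination is already free can make progress: the copy
 * (dst, src) becomes the half copies (dst, src) and (dst + 1, src + 1).
 */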
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

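/* Lower one group of parallel copies (all within the same register file)
 * into a sequence of moves and swaps emitted before instr, using the three
 * steps described below.
 */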
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    * n_1 -> n_2 -> ... -> n_i
    *         ^             |
    *         |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    * n_1 -> n_2 -> ... -> n_i
    * ^                     |
    * |---------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    * n_1 -> ... -> n_i
    * ^              |
    * |--------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }
   }
}

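/* Partition the copies by register file and lower each group separately:
 * shared regs first, then half/full regs, which form either one merged file
 * (v->mergedregs) or two independent files. Copies in different files never
 * block one another.
 */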
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->shader->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up since they don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   }
}

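/* Pass entry point: walk every block, flatten each parallel copy, collect,
 * and split meta instruction into a list of copy_entries for handle_copies(),
 * and then delete the meta instruction. Phi metas are deleted without
 * emitting anything.
 */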
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}