GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/bifrost/bi_ra.c

/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <[email protected]>
 */

#include "compiler.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
        unsigned node_count;
        uint64_t *affinity;

        /* Linear constraints imposed. Nested array sized upfront, organized as
         * linear[node_left][node_right]. That is, calculate indices as:
         *
         *    linear[node_left * node_count + node_right]
         *
         * Each element is itself a bit field denoting whether a (c_j - c_i)
         * bias is present or not, including negative biases.
         *
         * Note that for Bifrost there are 4 components, so the bias is in the
         * range [-3, 3] and is encoded in an 8-bit field. */
        uint8_t *linear;

        /* Before solving, forced registers; after solving, solutions. */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};
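
/* Worked example of the encoding above: a constraint that node j must not be
 * placed exactly 2 registers above node i (c_j - c_i != 2) is recorded by
 * setting bit (2 + 3) = 5 in linear[i * node_count + j], and the mirrored
 * constraint (c_i - c_j != -2) by setting bit (-2 + 3) = 1 in
 * linear[j * node_count + i]. lcra_test_linear() below checks exactly these
 * bits against a candidate solution. */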

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
        struct lcra_state *l = calloc(1, sizeof(*l));

        l->node_count = node_count;

        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);

        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

        return l;
}

static void
lcra_free(struct lcra_state *l)
{
        free(l->linear);
        free(l->affinity);
        free(l->solutions);
        free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
{
        if (i == j)
                return;

        uint8_t constraint_fw = 0;
        uint8_t constraint_bw = 0;

        for (unsigned D = 0; D < 4; ++D) {
                if (cmask_i & (cmask_j << D)) {
                        constraint_bw |= (1 << (3 + D));
                        constraint_fw |= (1 << (3 - D));
                }

                if (cmask_i & (cmask_j >> D)) {
                        constraint_fw |= (1 << (3 + D));
                        constraint_bw |= (1 << (3 - D));
                }
        }

        l->linear[j * l->node_count + i] |= constraint_fw;
        l->linear[i * l->node_count + j] |= constraint_bw;
}
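
/* For instance, a node i written with component mask 0b0011 (components 0
 * and 1) against a node j live with mask 0b0001 (component 0) produces
 * constraints forbidding c_j - c_i = 0 and c_j - c_i = 1: exactly the two
 * placements where j's single register would land on one of i's two
 * registers. */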

static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
        uint8_t *row = &l->linear[i * l->node_count];
        signed constant = solutions[i];

        for (unsigned j = 0; j < l->node_count; ++j) {
                if (solutions[j] == ~0) continue;

                signed lhs = solutions[j] - constant;

                if (lhs < -3 || lhs > 3)
                        continue;

                if (row[j] & (1 << (lhs + 3)))
                        return false;
        }

        return true;
}
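
/* Note that constraints only cover biases in [-3, 3]; candidate solutions more
 * than 3 registers apart can never conflict through `linear` and are skipped
 * outright. */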

static bool
lcra_solve(struct lcra_state *l)
{
        for (unsigned step = 0; step < l->node_count; ++step) {
                if (l->solutions[step] != ~0) continue;
                if (l->affinity[step] == 0) continue;

                bool succ = false;

                u_foreach_bit64(r, l->affinity[step]) {
                        l->solutions[step] = r;

                        if (lcra_test_linear(l, l->solutions, step)) {
                                succ = true;
                                break;
                        }
                }

                /* Out of registers - prepare to spill */
                if (!succ) {
                        l->spill_node = step;
                        return false;
                }
        }

        return true;
}
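
/* The solver above is greedy: nodes are visited in index order (skipping nodes
 * with a pre-forced solution or an empty affinity), and each takes the lowest
 * register in its affinity mask that satisfies the linear constraints against
 * the nodes already placed. There is no backtracking; if a node cannot be
 * placed, it is recorded as the spill node and solving fails. */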

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
        unsigned count = 0;
        uint8_t *constraints = &l->linear[i * l->node_count];

        for (unsigned j = 0; j < l->node_count; ++j)
                count += util_bitcount(constraints[j]);

        return count;
}

/* Construct an affinity mask such that the vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * an allocated base register a must satisfy (a + i) != b for each i < count
 * and each clobbered register b. Equivalently a != (b - i), so a must avoid
 * the set { b - i : b in clobber, i < count }. Taking the union over the whole
 * clobber set B, that is the union over i < count of (B - i), where
 * subtracting i elementwise from a bitset corresponds to shifting the entire
 * bitset right by i.
 */

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t clobbered = 0;

        for (unsigned i = 0; i < count; ++i)
                clobbered |= (clobber >> i);

        /* Don't allocate past the end of the register file */
        if (count > 1) {
                unsigned excess = count - 1;
                uint64_t mask = BITFIELD_MASK(excess);
                clobbered |= mask << (64 - excess);

                if (split_file)
                        clobbered |= mask << (16 - excess);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                clobbered |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~clobbered;
}
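
/* As a concrete example: with clobber = BITFIELD64_BIT(4) (r4 clobbered) and
 * count = 2, the shift loop above marks r3 and r4 as unusable bases, since a
 * two-register vector starting at r3 would also cover r4. The excess mask then
 * additionally rules out r63 (and r15 with split_file), where a two-register
 * vector would run off the end of the (sub)file. */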

static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint16_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);
                        l->affinity[node] &= (affinity >> offset);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i]) {
                                        lcra_add_node_interference(l, node,
                                                        bi_writemask(ins, d), i, live[i]);
                                }
                        }
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, _blk) {
                bi_block *blk = (bi_block *) _blk;
                uint16_t *live = mem_dup(_blk->live_out, node_count * sizeof(uint16_t));

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                node_count, ctx->inputs->is_blend, !full_regs);

                free(live);
        }
}
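
/* bi_mark_interference() walks each block bottom-up: `live` starts as the
 * block's live-out set and is updated instruction by instruction, so at the
 * point interference is recorded it holds exactly the nodes live after the
 * instruction being processed. */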

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count);

        /* Blend shaders are restricted to R0-R15. Other shaders at full
         * occupancy can also access R48-R63. At half occupancy they can
         * access the whole file. */

        uint64_t default_affinity =
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
                full_regs ? BITFIELD64_MASK(64) :
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned dest = bi_get_node(ins->dest[d]);

                        /* Blend shaders expect the src colour to be in r0-r3 */
                        if (ins->op == BI_OPCODE_BLEND &&
                            !ctx->inputs->is_blend) {
                                unsigned node = bi_get_node(ins->src[0]);
                                assert(node < node_count);
                                l->solutions[node] = 0;
                        }

                        if (dest < node_count)
                                l->affinity[dest] = default_affinity;
                }
        }

        bi_compute_interference(ctx, l, full_regs);

        *success = lcra_solve(l);

        return l;
}
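
/* Note that when full_regs is false, the default affinity above exposes only
 * r0-r15 and r48-r63; bi_compute_interference() then passes
 * split_file = !full_regs down to bi_make_affinity(), which additionally keeps
 * multi-register vectors from running past the end of the lower half of the
 * split file. */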

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
        /* Offsets can only be applied when we register allocated an index, or
         * alternatively for FAU's encoding */

        ASSERTED bool is_offset = (index.offset > 0) &&
                (index.type != BI_INDEX_FAU);
        unsigned node_count = bi_max_temp(ctx);

        /* Did we run RA for this index at all */
        if (bi_get_node(index) >= node_count) {
                assert(!is_offset);
                return index;
        }

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[bi_get_node(index)];
        if (solution < 0) {
                assert(!is_offset);
                return index;
        }

        /* todo: do we want to compose with the subword swizzle? */
        bi_index new_index = bi_register(solution + index.offset);
        new_index.swizzle = index.swizzle;
        new_index.abs = index.abs;
        new_index.neg = index.neg;
        return new_index;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d)
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

                bi_foreach_src(ins, s)
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
        }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
        bi_foreach_src(ins, i) {
                if (bi_is_equiv(ins->src[i], old)) {
                        ins->src[i].type = new.type;
                        ins->src[i].reg = new.reg;
                        ins->src[i].value = new.value;
                }
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node < l->node_count && ins->no_spill)
                                BITSET_SET(no_spill, node);
                }
        }

        unsigned best_benefit = 0;
        signed best_node = -1;

        for (unsigned i = 0; i < l->node_count; ++i) {
                if (BITSET_TEST(no_spill, i)) continue;

                /* Only spill nodes that interfere with the node failing
                 * register allocation. It's pointless to spill anything else */
                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;

                unsigned benefit = lcra_count_constraints(l, i);

                if (benefit > best_benefit) {
                        best_benefit = benefit;
                        best_node = i;
                }
        }

        free(no_spill);
        return best_node;
}
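
/* In other words, the "benefit" of spilling a node is simply the number of
 * constraint bits it participates in, and only nodes that constrain the node
 * that failed allocation are considered; a node whose defining instruction has
 * no_spill set (e.g. the short-lived temporaries introduced by earlier
 * spill/fill code) is never chosen again. */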

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
        unsigned max = 0;

        bi_foreach_src(I, s) {
                if (bi_is_equiv(I->src[s], index)) {
                        unsigned count = bi_count_read_registers(I, s);
                        max = MAX2(max, count + I->src[s].offset);
                }
        }

        return max;
}
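
/* This sizes the fill for a spilled node: the widest read of the node in an
 * instruction (including any source offset) determines how many 32-bit
 * registers the reload below must cover. */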

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill after every store, fill before every load */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        b.cursor = bi_after_instr(I);
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_to(&b, bits, tmp,
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}
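
/* Each spilled node therefore occupies channels * 4 bytes of thread-local
 * storage starting at `offset`, with 32-bit component c of the node living at
 * byte offset + 4 * c; the caller bumps its running spill_count by the
 * returned size before spilling the next node. */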

void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info->tls_size;

        /* Try with reduced register pressure to improve thread count on v7 */
        if (ctx->arch == 7) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info->work_reg_count = 32;
                } else {
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the full register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info->work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);
                }
        }

        assert(success);

        ctx->info->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}