GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_register_allocate.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

#define QPU_R(file, index) { QPU_MUX_##file, index }

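/*
 * Table of the registers the allocator can hand out: the five accumulators
 * (r0-r4) first, then the 32 regfile A and 32 regfile B registers
 * interleaved as a0, b0, a1, b1, ...
 */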
static const struct qpu_reg vc4_regs[] = {
        { QPU_MUX_R0, 0},
        { QPU_MUX_R1, 0},
        { QPU_MUX_R2, 0},
        { QPU_MUX_R3, 0},
        { QPU_MUX_R4, 0},
        QPU_R(A, 0),
        QPU_R(B, 0),
        QPU_R(A, 1),
        QPU_R(B, 1),
        QPU_R(A, 2),
        QPU_R(B, 2),
        QPU_R(A, 3),
        QPU_R(B, 3),
        QPU_R(A, 4),
        QPU_R(B, 4),
        QPU_R(A, 5),
        QPU_R(B, 5),
        QPU_R(A, 6),
        QPU_R(B, 6),
        QPU_R(A, 7),
        QPU_R(B, 7),
        QPU_R(A, 8),
        QPU_R(B, 8),
        QPU_R(A, 9),
        QPU_R(B, 9),
        QPU_R(A, 10),
        QPU_R(B, 10),
        QPU_R(A, 11),
        QPU_R(B, 11),
        QPU_R(A, 12),
        QPU_R(B, 12),
        QPU_R(A, 13),
        QPU_R(B, 13),
        QPU_R(A, 14),
        QPU_R(B, 14),
        QPU_R(A, 15),
        QPU_R(B, 15),
        QPU_R(A, 16),
        QPU_R(B, 16),
        QPU_R(A, 17),
        QPU_R(B, 17),
        QPU_R(A, 18),
        QPU_R(B, 18),
        QPU_R(A, 19),
        QPU_R(B, 19),
        QPU_R(A, 20),
        QPU_R(B, 20),
        QPU_R(A, 21),
        QPU_R(B, 21),
        QPU_R(A, 22),
        QPU_R(B, 22),
        QPU_R(A, 23),
        QPU_R(B, 23),
        QPU_R(A, 24),
        QPU_R(B, 24),
        QPU_R(A, 25),
        QPU_R(B, 25),
        QPU_R(A, 26),
        QPU_R(B, 26),
        QPU_R(A, 27),
        QPU_R(B, 27),
        QPU_R(A, 28),
        QPU_R(B, 28),
        QPU_R(A, 29),
        QPU_R(B, 29),
        QPU_R(A, 30),
        QPU_R(B, 30),
        QPU_R(A, 31),
        QPU_R(B, 31),
};
#define ACC_INDEX 0
#define ACC_COUNT 5
#define AB_INDEX (ACC_INDEX + ACC_COUNT)
#define AB_COUNT 64

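/*
 * Lazily builds the context's register set and classes the first time a
 * shader is compiled; subsequent compiles reuse the cached vc4->regs.
 */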
static void
vc4_alloc_reg_set(struct vc4_context *vc4)
{
        assert(vc4_regs[AB_INDEX].addr == 0);
        assert(vc4_regs[AB_INDEX + 1].addr == 0);
        STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64);

        if (vc4->regs)
                return;

        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), false);

        /* The physical regfiles split us into two classes, with [0] being the
         * whole space and [1] being the bottom half (for threaded fragment
         * shaders).
         */
        for (int i = 0; i < 2; i++) {
                vc4->reg_class_any[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a_or_b[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_r4_or_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
        }
        vc4->reg_class_r0_r3 = ra_alloc_contig_reg_class(vc4->regs, 1);

        /* r0-r3 */
        for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
                ra_class_add_reg(vc4->reg_class_r0_r3, i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i);
        }

        /* R4 gets a special class because it can't be written as a general
         * purpose register. (it's TMU_NOSWAP as a write address).
         */
        for (int i = 0; i < 2; i++) {
                ra_class_add_reg(vc4->reg_class_r4_or_a[i], ACC_INDEX + 4);
                ra_class_add_reg(vc4->reg_class_any[i], ACC_INDEX + 4);
        }

        /* A/B */
        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i++) {
                /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in
                 * vc4_qpu_emit.c
                 */
                if (vc4_regs[i].addr == 14)
                        continue;

                ra_class_add_reg(vc4->reg_class_any[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i);

                if (vc4_regs[i].addr < 16) {
                        ra_class_add_reg(vc4->reg_class_any[1], i);
                        ra_class_add_reg(vc4->reg_class_a_or_b[1], i);
                        ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i);
                }

                /* A only */
                if (((i - AB_INDEX) & 1) == 0) {
                        ra_class_add_reg(vc4->reg_class_a[0], i);
                        ra_class_add_reg(vc4->reg_class_r4_or_a[0], i);

                        if (vc4_regs[i].addr < 16) {
                                ra_class_add_reg(vc4->reg_class_a[1], i);
                                ra_class_add_reg(vc4->reg_class_r4_or_a[1], i);
                        }
                }
        }

        ra_set_finalize(vc4->regs, NULL);
}

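/*
 * Record tying a QIR temp to a sort priority (its live-range length), used
 * to order temps before assigning them register-allocator node indices.
 */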
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

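/*
 * Bit set of the register files a temp may still live in.  Every temp
 * starts with all bits set, and the instruction walk in
 * vc4_register_allocate() clears bits as hardware restrictions are found.
 */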
#define CLASS_BIT_A (1 << 0)
#define CLASS_BIT_B (1 << 1)
#define CLASS_BIT_R4 (1 << 2)
#define CLASS_BIT_R0_R3 (1 << 4)

struct vc4_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_ab;
};

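/*
 * Register-selection callback for the common allocator: given the set of
 * registers still legal for a node, pick the one we prefer.
 */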
static unsigned int
vc4_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct vc4_ra_select_callback_data *vc4_ra = data;

        /* If r4 is available, always choose it -- few other things can go
         * there, and choosing anything else means inserting a mov.
         */
        if (BITSET_TEST(regs, ACC_INDEX + 4))
                return ACC_INDEX + 4;

        /* Choose an accumulator if possible (no delay between write and
         * read), but round-robin through them to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (vc4_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        vc4_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }

        for (int i = 0; i < AB_COUNT; i++) {
                int ab_off = (vc4_ra->next_ab + i) % AB_COUNT;
                int ab = AB_INDEX + ab_off;

                if (BITSET_TEST(regs, ab)) {
                        vc4_ra->next_ab = ab_off + 1;
                        return ab;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

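/*
 * Rough usage sketch (illustrative, not code from this file): the QPU code
 * generator would call this once per compile and index the result by temp,
 * e.g.
 *
 *     struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
 *     if (!temp_registers)
 *             return;                     (c->failed has been set)
 *     ... look up temp_registers[temp_index] while emitting ...
 *     free(temp_registers);
 */
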
/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));
        struct vc4_ra_select_callback_data callback_data = {
                .next_acc = 0,
                .next_ab = 0,
        };

        /* If things aren't ever written (undefined values), just read from
         * r0.
         */
        for (uint32_t i = 0; i < c->num_temps; i++)
                temp_registers[i] = qpu_rn(0);

        vc4_alloc_reg_set(vc4);

        struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                         c->num_temps);

        /* Compute the live ranges so we can figure out interference. */
        qir_calculate_live_intervals(c);

        ra_set_select_reg_callback(g, vc4_ra_select_callback, &callback_data);

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers. We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits,
               CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3,
               sizeof(class_bits));

        int ip = 0;
        qir_for_each_inst_inorder(inst, c) {
                if (qir_writes_r4(inst)) {
                        /* This instruction writes r4 (and optionally moves
                         * its result to a temp), so nothing else can be
                         * stored in r4 across it.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~CLASS_BIT_R4;
                        }

                        /* If the move of the r4 result to its temp is
                         * conditional (math, tex results), keep the
                         * destination out of r4 so the result actually gets
                         * conditionally moved into its own register.
                         */
                        if (inst->cond != QPU_COND_ALWAYS)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                } else {
                        /* R4 can't be written as a general purpose
                         * register. (it's TMU_NOSWAP as a write address).
                         */
                        if (inst->dst.file == QFILE_TEMP)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                }

                switch (inst->op) {
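                /* The fragment Z and W payload values arrive in fixed raddr
                 * locations, so pin those temps to the matching entries of
                 * the interleaved A/B table (even index = file A, odd =
                 * file B).
                 */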
                case QOP_FRAG_Z:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
                        break;

                case QOP_FRAG_W:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                        break;

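                /* The MUL-unit rotate can only take its input from an
                 * accumulator, so restrict the source to r0-r3.
                 */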
                case QOP_ROT_MUL:
                        assert(inst->src[0].file == QFILE_TEMP);
                        class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3;
                        break;

                case QOP_THRSW:
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
                                                           CLASS_BIT_R4);
                        }
                        break;

                default:
                        break;
                }

                if (inst->dst.pack && !qir_is_mul(inst)) {
                        /* The non-MUL pack flags require an A-file dst
                         * register.
                         */
                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                }

                /* Apply restrictions for src unpacks. The integer unpacks
                 * can only be done from regfile A, while float unpacks can be
                 * either A or R4.
                 */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP &&
                            inst->src[i].pack) {
                                if (qir_is_float_input(inst)) {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A | CLASS_BIT_R4;
                                } else {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A;
                                }
                        }
                }

                ip++;
        }

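        /* Map each temp's remaining class_bits onto one of the classes built
         * in vc4_alloc_reg_set().  Combinations that weren't built there
         * either fail the threaded-FS compile or are a bug.
         */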
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                switch (class_bits[i]) {
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_any[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b_or_acc[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_R4:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_r4_or_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_A:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node, vc4->reg_class_r0_r3);
                        break;

                default:
                        /* DDX/DDY used across a thread switch might get us
                         * here.
                         */
                        if (c->fs_threaded) {
                                c->failed = true;
                                free(temp_registers);
                                return NULL;
                        }

                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
                                i, class_bits[i]);
                        abort();
                        break;
                }
        }

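        /* Add an interference edge between any two temps whose live ranges
         * overlap.
         */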
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                if (!c->fs_threaded) {
                        fprintf(stderr, "Failed to register allocate:\n");
                        qir_dump(c);
                }

                c->failed = true;
                free(temp_registers);
                return NULL;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i])
                        temp_registers[i] = qpu_ra(QPU_W_NOP);
        }

        ralloc_free(g);

        return temp_registers;
}