CoCalc -- bi

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/bifrost/bi_ra.c
⁴⁵⁶⁴ views
1
/*
2
 * Copyright (C) 2020 Collabora Ltd.
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
 * SOFTWARE.
22
 *
23
 * Authors (Collabora):
24
 *      Alyssa Rosenzweig <[email protected]>
25
 */
26

27
#include "compiler.h"
28
#include "bi_builder.h"
29
#include "util/u_memory.h"
30

31
struct lcra_state {
32
        unsigned node_count;
33
        uint64_t *affinity;
34

35
        /* Linear constraints imposed. Nested array sized upfront, organized as
36
         * linear[node_left][node_right]. That is, calculate indices as:
37
         *
38
         * Each element is itself a bit field denoting whether (c_j - c_i) bias
39
         * is present or not, including negative biases.
40
         *
41
         * Note for Bifrost, there are 4 components so the bias is in range
42
         * [-3, 3] so encoded by 8-bit field. */
43

44
        uint8_t *linear;
45

46
        /* Before solving, forced registers; after solving, solutions. */
47
        unsigned *solutions;
48

49
        /** Node which caused register allocation to fail */
50
        unsigned spill_node;
51
};
52

53
/* This module is an implementation of "Linearly Constrained
54
 * Register Allocation". The paper is available in PDF form
55
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
56
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
57
 */
58

59
static struct lcra_state *
60
lcra_alloc_equations(unsigned node_count)
61
{
62
        struct lcra_state *l = calloc(1, sizeof(*l));
63

64
        l->node_count = node_count;
65

66
        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
67
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
68
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);
69

70
        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);
71

72
        return l;
73
}
74

75
static void
76
lcra_free(struct lcra_state *l)
77
{
78
        free(l->linear);
79
        free(l->affinity);
80
        free(l->solutions);
81
        free(l);
82
}
83

84
static void
85
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
86
{
87
        if (i == j)
88
                return;
89

90
        uint8_t constraint_fw = 0;
91
        uint8_t constraint_bw = 0;
92

93
        for (unsigned D = 0; D < 4; ++D) {
94
                if (cmask_i & (cmask_j << D)) {
95
                        constraint_bw |= (1 << (3 + D));
96
                        constraint_fw |= (1 << (3 - D));
97
                }
98

99
                if (cmask_i & (cmask_j >> D)) {
100
                        constraint_fw |= (1 << (3 + D));
101
                        constraint_bw |= (1 << (3 - D));
102
                }
103
        }
104

105
        l->linear[j * l->node_count + i] |= constraint_fw;
106
        l->linear[i * l->node_count + j] |= constraint_bw;
107
}
108

109
static bool
110
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
111
{
112
        uint8_t *row = &l->linear[i * l->node_count];
113
        signed constant = solutions[i];
114

115
        for (unsigned j = 0; j < l->node_count; ++j) {
116
                if (solutions[j] == ~0) continue;
117

118
                signed lhs = solutions[j] - constant;
119

120
                if (lhs < -3 || lhs > 3)
121
                        continue;
122

123
                if (row[j] & (1 << (lhs + 3)))
124
                        return false;
125
        }
126

127
        return true;
128
}
129

130
static bool
131
lcra_solve(struct lcra_state *l)
132
{
133
        for (unsigned step = 0; step < l->node_count; ++step) {
134
                if (l->solutions[step] != ~0) continue;
135
                if (l->affinity[step] == 0) continue;
136

137
                bool succ = false;
138

139
                u_foreach_bit64(r, l->affinity[step]) {
140
                        l->solutions[step] = r;
141

142
                        if (lcra_test_linear(l, l->solutions, step)) {
143
                                succ = true;
144
                                break;
145
                        }
146
                }
147

148
                /* Out of registers - prepare to spill */
149
                if (!succ) {
150
                        l->spill_node = step;
151
                        return false;
152
                }
153
        }
154

155
        return true;
156
}
157

158
/* Register spilling is implemented with a cost-benefit system. Costs are set
159
 * by the user. Benefits are calculated from the constraints. */
160

161
static unsigned
162
lcra_count_constraints(struct lcra_state *l, unsigned i)
163
{
164
        unsigned count = 0;
165
        uint8_t *constraints = &l->linear[i * l->node_count];
166

167
        for (unsigned j = 0; j < l->node_count; ++j)
168
                count += util_bitcount(constraints[j]);
169

170
        return count;
171
}
172

173
/* Construct an affinity mask such that the vector with `count` elements does
174
 * not intersect any of the registers in the bitset `clobber`. In other words,
175
 * an allocated register r needs to satisfy for each i < count: a + i != b.
176
 * Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the
177
 * entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where
178
 * that union is the desired clobber set. That may be written equivalently as
179
 * the union over i < n of (B - i), where subtraction is defined elementwise
180
 * and corresponds to a shift of the entire bitset.
181
 */
182

183
static uint64_t
184
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
185
{
186
        uint64_t clobbered = 0;
187

188
        for (unsigned i = 0; i < count; ++i)
189
                clobbered |= (clobber >> i);
190

191
        /* Don't allocate past the end of the register file */
192
        if (count > 1) {
193
                unsigned excess = count - 1;
194
                uint64_t mask = BITFIELD_MASK(excess);
195
                clobbered |= mask << (64 - excess);
196

197
                if (split_file)
198
                        clobbered |= mask << (16 - excess);
199
        }
200

201
        /* Don't allocate the middle if we split out the middle */
202
        if (split_file)
203
                clobbered |= BITFIELD64_MASK(32) << 16;
204

205
        /* We can use a register iff it's not clobberred */
206
        return ~clobbered;
207
}
208

209
static void
210
bi_mark_interference(bi_block *block, struct lcra_state *l, uint16_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file)
211
{
212
        bi_foreach_instr_in_block_rev(block, ins) {
213
                /* Mark all registers live after the instruction as
214
                 * interfering with the destination */
215

216
                bi_foreach_dest(ins, d) {
217
                        unsigned node = bi_get_node(ins->dest[d]);
218

219
                        if (node >= node_count)
220
                                continue;
221

222
                        /* Don't allocate to anything that's read later as a
223
                         * preloaded register. The affinity is the intersection
224
                         * of affinity masks for each write. Since writes have
225
                         * offsets, but the affinity is for the whole node, we
226
                         * need to offset the affinity opposite the write
227
                         * offset, so we shift right. */
228
                        unsigned count = bi_count_write_registers(ins, d);
229
                        unsigned offset = ins->dest[d].offset;
230
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);
231
                        l->affinity[node] &= (affinity >> offset);
232

233
                        for (unsigned i = 0; i < node_count; ++i) {
234
                                if (live[i]) {
235
                                        lcra_add_node_interference(l, node,
236
                                                        bi_writemask(ins, d), i, live[i]);
237
                                }
238
                        }
239
                }
240

241
                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
242
                        /* Blend shaders might clobber r0-r15, r48. */
243
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
244

245
                        for (unsigned i = 0; i < node_count; ++i) {
246
                                if (live[i])
247
                                        l->affinity[i] &= ~clobber;
248
                        }
249
                }
250

251
                /* Update live_in */
252
                preload_live = bi_postra_liveness_ins(preload_live, ins);
253
                bi_liveness_ins_update(live, ins, node_count);
254
        }
255

256
        block->reg_live_in = preload_live;
257
}
258

259
static void
260
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
261
{
262
        unsigned node_count = bi_max_temp(ctx);
263

264
        bi_compute_liveness(ctx);
265
        bi_postra_liveness(ctx);
266

267
        bi_foreach_block_rev(ctx, _blk) {
268
                bi_block *blk = (bi_block *) _blk;
269
                uint16_t *live = mem_dup(_blk->live_out, node_count * sizeof(uint16_t));
270

271
                bi_mark_interference(blk, l, live, blk->reg_live_out,
272
                                node_count, ctx->inputs->is_blend, !full_regs);
273

274
                free(live);
275
        }
276
}
277

278
static struct lcra_state *
279
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
280
{
281
        unsigned node_count = bi_max_temp(ctx);
282
        struct lcra_state *l = lcra_alloc_equations(node_count);
283

284
        /* Blend shaders are restricted to R0-R15. Other shaders at full
285
         * occupancy also can access R48-R63. At half occupancy they can access
286
         * the whole file. */
287

288
        uint64_t default_affinity =
289
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
290
                full_regs ? BITFIELD64_MASK(64) :
291
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
292

293
        bi_foreach_instr_global(ctx, ins) {
294
                bi_foreach_dest(ins, d) {
295
                        unsigned dest = bi_get_node(ins->dest[d]);
296

297
                        /* Blend shaders expect the src colour to be in r0-r3 */
298
                        if (ins->op == BI_OPCODE_BLEND &&
299
                            !ctx->inputs->is_blend) {
300
                                unsigned node = bi_get_node(ins->src[0]);
301
                                assert(node < node_count);
302
                                l->solutions[node] = 0;
303
                        }
304

305
                        if (dest < node_count)
306
                                l->affinity[dest] = default_affinity;
307
                }
308

309
        }
310

311
        bi_compute_interference(ctx, l, full_regs);
312

313
        *success = lcra_solve(l);
314

315
        return l;
316
}
317

318
static bi_index
319
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
320
{
321
        /* Offsets can only be applied when we register allocated an index, or
322
         * alternatively for FAU's encoding */
323

324
        ASSERTED bool is_offset = (index.offset > 0) &&
325
                (index.type != BI_INDEX_FAU);
326
        unsigned node_count = bi_max_temp(ctx);
327

328
        /* Did we run RA for this index at all */
329
        if (bi_get_node(index) >= node_count) {
330
                assert(!is_offset);
331
                return index;
332
        }
333

334
        /* LCRA didn't bother solving this index (how lazy!) */
335
        signed solution = l->solutions[bi_get_node(index)];
336
        if (solution < 0) {
337
                assert(!is_offset);
338
                return index;
339
        }
340

341
        /* todo: do we want to compose with the subword swizzle? */
342
        bi_index new_index = bi_register(solution + index.offset);
343
        new_index.swizzle = index.swizzle;
344
        new_index.abs = index.abs;
345
        new_index.neg = index.neg;
346
        return new_index;
347
}
348

349
static void
350
bi_install_registers(bi_context *ctx, struct lcra_state *l)
351
{
352
        bi_foreach_instr_global(ctx, ins) {
353
                bi_foreach_dest(ins, d)
354
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);
355

356
                bi_foreach_src(ins, s)
357
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
358
        }
359
}
360

361
static void
362
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
363
{
364
        bi_foreach_src(ins, i) {
365
                if (bi_is_equiv(ins->src[i], old)) {
366
                        ins->src[i].type = new.type;
367
                        ins->src[i].reg = new.reg;
368
                        ins->src[i].value = new.value;
369
                }
370
        }
371
}
372

373
/* If register allocation fails, find the best spill node */
374

375
static signed
376
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
377
{
378
        /* Pick a node satisfying bi_spill_register's preconditions */
379
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));
380

381
        bi_foreach_instr_global(ctx, ins) {
382
                bi_foreach_dest(ins, d) {
383
                        unsigned node = bi_get_node(ins->dest[d]);
384

385
                        if (node < l->node_count && ins->no_spill)
386
                                BITSET_SET(no_spill, node);
387
                }
388
        }
389

390
        unsigned best_benefit = 0.0;
391
        signed best_node = -1;
392

393
        for (unsigned i = 0; i < l->node_count; ++i) {
394
                if (BITSET_TEST(no_spill, i)) continue;
395

396
                /* Only spill nodes that interfere with the node failing
397
                 * register allocation. It's pointless to spill anything else */
398
                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;
399

400
                unsigned benefit = lcra_count_constraints(l, i);
401

402
                if (benefit > best_benefit) {
403
                        best_benefit = benefit;
404
                        best_node = i;
405
                }
406
        }
407

408
        free(no_spill);
409
        return best_node;
410
}
411

412
static unsigned
413
bi_count_read_index(bi_instr *I, bi_index index)
414
{
415
        unsigned max = 0;
416

417
        bi_foreach_src(I, s) {
418
                if (bi_is_equiv(I->src[s], index)) {
419
                        unsigned count = bi_count_read_registers(I, s);
420
                        max = MAX2(max, count + I->src[s].offset);
421
                }
422
        }
423

424
        return max;
425
}
426

427
/* Once we've chosen a spill node, spill it and returns bytes spilled */
428

429
static unsigned
430
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
431
{
432
        bi_builder b = { .shader = ctx };
433
        unsigned channels = 0;
434

435
        /* Spill after every store, fill before every load */
436
        bi_foreach_instr_global_safe(ctx, I) {
437
                bi_foreach_dest(I, d) {
438
                        if (!bi_is_equiv(I->dest[d], index)) continue;
439

440
                        unsigned extra = I->dest[d].offset;
441
                        bi_index tmp = bi_temp(ctx);
442

443
                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
444
                        I->no_spill = true;
445

446
                        unsigned count = bi_count_write_registers(I, d);
447
                        unsigned bits = count * 32;
448

449
                        b.cursor = bi_after_instr(I);
450
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
451
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);
452

453
                        ctx->spills++;
454
                        channels = MAX2(channels, extra + count);
455
                }
456

457
                if (bi_has_arg(I, index)) {
458
                        b.cursor = bi_before_instr(I);
459
                        bi_index tmp = bi_temp(ctx);
460

461
                        unsigned bits = bi_count_read_index(I, index) * 32;
462
                        bi_rewrite_index_src_single(I, index, tmp);
463

464
                        bi_instr *ld = bi_load_to(&b, bits, tmp,
465
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
466
                        ld->no_spill = true;
467
                        ctx->fills++;
468
                }
469
        }
470

471
        return (channels * 4);
472
}
473

474
void
475
bi_register_allocate(bi_context *ctx)
476
{
477
        struct lcra_state *l = NULL;
478
        bool success = false;
479

480
        unsigned iter_count = 1000; /* max iterations */
481

482
        /* Number of bytes of memory we've spilled into */
483
        unsigned spill_count = ctx->info->tls_size;
484

485
        /* Try with reduced register pressure to improve thread count on v7 */
486
        if (ctx->arch == 7) {
487
                bi_invalidate_liveness(ctx);
488
                l = bi_allocate_registers(ctx, &success, false);
489

490
                if (success) {
491
                        ctx->info->work_reg_count = 32;
492
                } else if (!success) {
493
                        lcra_free(l);
494
                        l = NULL;
495
                }
496
        }
497

498
        /* Otherwise, use the register file and spill until we succeed */
499
        while (!success && ((iter_count--) > 0)) {
500
                bi_invalidate_liveness(ctx);
501
                l = bi_allocate_registers(ctx, &success, true);
502

503
                if (success) {
504
                        ctx->info->work_reg_count = 64;
505
                } else {
506
                        signed spill_node = bi_choose_spill_node(ctx, l);
507
                        lcra_free(l);
508
                        l = NULL;
509

510
                        if (spill_node == -1)
511
                                unreachable("Failed to choose spill node\n");
512

513
                        spill_count += bi_spill_register(ctx,
514
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
515
                                        spill_count);
516
                }
517
        }
518

519
        assert(success);
520

521
        ctx->info->tls_size = spill_count;
522
        bi_install_registers(ctx, l);
523

524
        lcra_free(l);
525
}
526

527
Product

Resources

Company