GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_ra.c

/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "util/rb_tree.h"
#include "util/u_math.h"
#include "ir3_shader.h"

/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */

/* ir3_reg_ctx implementation. */

static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ir3_reg_interval *interval =
      ir3_rb_node_to_interval_const(node);
   if (interval->reg->interval_start > reg)
      return -1;
   else if (interval->reg->interval_end <= reg)
      return 1;
   else
      return 0;
}

static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node =
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
{
   struct ir3_reg_interval *interval =
      ir3_reg_interval_search_sloppy(tree, offset);
   if (!interval) {
      return NULL;
   } else if (interval->reg->interval_end > offset) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_file_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ir3_reg_interval_next_or_null(interval);
   }
}

static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
   return b->reg->interval_start - a->reg->interval_start;
}

static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
{
   struct ir3_reg_interval *right =
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
   if (right && right->reg->interval_start < interval->reg->interval_end) {
      /* We disallow trees where different members have different half-ness.
       * This means that we can't treat bitcasts as copies like normal
       * split/collect, so something like this would require an extra copy
       * in mergedregs mode, and count as 4 half-units of register pressure
       * instead of 2:
       *
       * f16vec2 foo = unpackFloat2x16(bar)
       * ... = foo.x
       * ... = bar
       *
       * However, relaxing this rule would open a huge can of worms. What
       * happens when there's a vector of 16 things, and the fifth element
       * has been bitcasted as a half-reg? Would that element alone have to
       * be small enough to be used as a half-reg source? Let's keep that
       * can of worms firmly shut for now.
       */
      assert((interval->reg->flags & IR3_REG_HALF) ==
             (right->reg->flags & IR3_REG_HALF));

      if (right->reg->interval_end <= interval->reg->interval_end &&
          right->reg->interval_start >= interval->reg->interval_start) {
         /* Check if we're inserting something that's already inserted */
         assert(interval != right);

         /* "right" is contained in "interval" and must become a child of
          * it. There may be further children too.
          */
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
              right && right->reg->interval_start < interval->reg->interval_end;
              right = next, next = ir3_reg_interval_next_or_null(next)) {
            /* "right" must be contained in "interval." */
            assert(right->reg->interval_end <= interval->reg->interval_end);
            assert((interval->reg->flags & IR3_REG_HALF) ==
                   (right->reg->flags & IR3_REG_HALF));
            if (!right->parent)
               ctx->interval_delete(ctx, right);
            right->parent = interval;
            rb_tree_remove(tree, &right->node);
            rb_tree_insert(&interval->children, &right->node,
                           ir3_reg_interval_insert_cmp);
         }
      } else {
         /* "right" must contain "interval," since intervals must form a
          * tree.
          */
         assert(right->reg->interval_start <= interval->reg->interval_start);
         interval->parent = right;
         interval_insert(ctx, &right->children, interval);
         return;
      }
   }

   if (!interval->parent)
      ctx->interval_add(ctx, interval);
   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
   interval->inserted = true;
}

void
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   interval_insert(ctx, &ctx->intervals, interval);
}

void
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   if (interval->parent) {
      rb_tree_remove(&interval->parent->children, &interval->node);
   } else {
      ctx->interval_delete(ctx, interval);
      rb_tree_remove(&ctx->intervals, &interval->node);
   }

   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
                         node) {
      rb_tree_remove(&interval->children, &child->node);
      child->parent = interval->parent;

      if (interval->parent) {
         rb_tree_insert(&child->parent->children, &child->node,
                        ir3_reg_interval_insert_cmp);
      } else {
         ctx->interval_readd(ctx, interval, child);
         rb_tree_insert(&ctx->intervals, &child->node,
                        ir3_reg_interval_insert_cmp);
      }
   }

   interval->inserted = false;
}

void
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
}

static void
interval_dump(struct ir3_reg_interval *interval, unsigned indent)
{
   for (unsigned i = 0; i < indent; i++)
      printf("\t");
   printf("reg %u start %u\n", interval->reg->name,
          interval->reg->interval_start);

   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      interval_dump(child, indent + 1);
   }

   for (unsigned i = 0; i < indent; i++)
      printf("\t");
   printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);
}

void
ir3_reg_interval_dump(struct ir3_reg_interval *interval)
{
   interval_dump(interval, 0);
}

/* These are the core datastructures used by the register allocator. First
 * ra_interval and ra_file, which are used for intra-block tracking and use
 * the ir3_reg_ctx infrastructure:
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* True if this is a source of the current instruction which is entirely
    * killed. This means we can allocate the dest over it, but we can't break
    * it up.
    */
   bool is_killed;

   /* True if this interval cannot be moved from its position. This is only
    * used for precolored inputs to ensure that other inputs don't get
    * allocated on top of them.
    */
   bool frozen;
};

struct ra_file {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   unsigned size;
   unsigned start;
};

/* State for inter-block tracking. When we split a live range to make space
 * for a vector, we may need to insert fixup code when a block has multiple
 * predecessors that have moved the same live value to different registers.
 * This keeps track of state required to do that.
 */

struct ra_block_state {
   /* Map of defining ir3_register -> physreg it was allocated to at the end
    * of the block.
    */
   struct hash_table *renames;

   /* For loops, we need to process a block before all its predecessors have
    * been processed. In particular, we need to pick registers for values
    * without knowing if all the predecessors have been renamed. This keeps
    * track of the registers we chose so that when we visit the back-edge we
    * can move them appropriately. If all predecessors have been visited
    * before this block is visited then we don't need to fill this out. This
    * is a map from ir3_register -> physreg.
    */
   struct hash_table *entry_regs;

   /* True if the block has been visited and "renames" is complete.
    */
   bool visited;

   /* True if the block is unreachable via the logical CFG. This happens for
    * blocks after an if where both sides end in a break/continue. We ignore
    * it for everything but shared registers.
    */
   bool logical_unreachable;
};

struct ra_parallel_copy {
   struct ra_interval *interval;
   physreg_t src;
};

/* The main context: */

struct ra_ctx {
   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
    * half of this file too.
    */
   struct ra_file full;

   /* hr0.x - hr63.w, only used without merged-regs. */
   struct ra_file half;

   /* Shared regs. */
   struct ra_file shared;

   struct ir3 *ir;

   struct ir3_liveness *live;

   struct ir3_block *block;

   const struct ir3_compiler *compiler;
   gl_shader_stage stage;

   /* Pending moves of top-level intervals that will be emitted once we're
    * finished:
    */
   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);

   struct ra_interval *intervals;
   struct ra_block_state *blocks;

   bool merged_regs;
};

#define foreach_interval(interval, file)                                      \
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \
                    physreg_node)
#define foreach_interval_rev(interval, file)                                  \
   rb_tree_foreach_rev (struct ra_interval, interval,                         \
                        &(file)->physreg_intervals, physreg_node)
#define foreach_interval_safe(interval, file)                                 \
   rb_tree_foreach_safe (struct ra_interval, interval,                        \
                         &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev_safe(interval, file)                             \
   rb_tree_foreach_rev_safe(struct ra_interval, interval,                     \
                            &(file)->physreg_intervals, physreg_node)

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_file_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_file_search_right(struct ra_file *file, physreg_t reg)
{
   return ra_interval_search_right(&file->physreg_intervals, reg);
}

static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_file *
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_file, ctx, reg_ctx);
}

static void
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
    */
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
      BITSET_CLEAR(file->available_to_evict, i);
   }

   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
                  ra_interval_insert_cmp);
}

static void
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
}

static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);

   child->physreg_start =
      parent->physreg_start + (child->interval.reg->interval_start -
                               parent->interval.reg->interval_start);
   child->physreg_end =
      child->physreg_start +
      (child->interval.reg->interval_end - child->interval.reg->interval_start);

   interval_add(ctx, _child);
}

static void
ra_file_init(struct ra_file *file)
{
   for (unsigned i = 0; i < file->size; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   file->start = 0;

   rb_tree_init(&file->reg_ctx.intervals);
   rb_tree_init(&file->physreg_intervals);

   file->reg_ctx.interval_add = interval_add;
   file->reg_ctx.interval_delete = interval_delete;
   file->reg_ctx.interval_readd = interval_readd;
}

static void
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
{
   assert(interval->physreg_start < interval->physreg_end);
   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
}

static void
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
{
   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
}

static void
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
   }

   interval->is_killed = true;
}

static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
{
   unsigned child_start = interval->interval.reg->interval_start;

   while (interval->interval.parent) {
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   }

   return interval->physreg_start +
          (child_start - interval->interval.reg->interval_start);
}

static unsigned
ra_interval_get_num(const struct ra_interval *interval)
{
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
                            interval->interval.reg->flags);
}

static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
   interval->is_killed = false;
   interval->frozen = false;
}

static void
ra_interval_dump(struct ra_interval *interval)
{
   printf("physreg %u ", interval->physreg_start);

   ir3_reg_interval_dump(&interval->interval);
}

static void
ra_file_dump(struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      ra_interval_dump(interval);
   }

   unsigned start, end;
   printf("available:\n");
   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
      printf("%u-%u ", start, end);
   }
   printf("\n");

   printf("available to evict:\n");
   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
      printf("%u-%u ", start, end);
   }
   printf("\n");
   printf("start: %u\n", file->start);
}

static void
ra_ctx_dump(struct ra_ctx *ctx)
{
   printf("full:\n");
   ra_file_dump(&ctx->full);
   printf("half:\n");
   ra_file_dump(&ctx->half);
   printf("shared:\n");
   ra_file_dump(&ctx->shared);
}

static unsigned
reg_file_size(struct ra_file *file, struct ir3_register *reg)
{
   /* Half-regs can only take up the first half of the combined regfile */
   if (reg->flags & IR3_REG_HALF)
      return MIN2(file->size, RA_HALF_SIZE);
   else
      return file->size;
}

/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
 * top-level intervals at once. Pop multiple intervals, then push them back in
 * any order.
 */

struct ra_removed_interval {
   struct ra_interval *interval;
   unsigned size;
};

static struct ra_removed_interval
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   /* Check if we've already moved this reg before */
   unsigned pcopy_index;
   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
        pcopy_index++) {
      if (ctx->parallel_copies[pcopy_index].interval == interval)
         break;
   }

   if (pcopy_index == ctx->parallel_copies_count) {
      array_insert(ctx, ctx->parallel_copies,
                   (struct ra_parallel_copy){
                      .interval = interval,
                      .src = interval->physreg_start,
                   });
   }

   ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);

   return (struct ra_removed_interval){
      .interval = interval,
      .size = interval->physreg_end - interval->physreg_start,
   };
}

static void
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
                 const struct ra_removed_interval *removed, physreg_t dst)
{
   struct ra_interval *interval = removed->interval;

   interval->physreg_start = dst;
   interval->physreg_end = dst + removed->size;

   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
}

/* Pick up the interval and place it at "dst". */
static void
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval, physreg_t dst)
{
   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
   ra_push_interval(ctx, file, &temp, dst);
}

static bool
get_reg_specified(struct ra_file *file, struct ir3_register *reg,
                  physreg_t physreg, bool is_source)
{
   for (unsigned i = 0; i < reg_size(reg); i++) {
      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
                       physreg + i))
         return false;
   }

   return true;
}

/* Try to evict any registers conflicting with the proposed spot "physreg" for
 * "reg". That is, move them to other places so that we can allocate "physreg"
 * here.
 */

static bool
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
               struct ir3_register *reg, physreg_t physreg,
               unsigned *_eviction_count, bool is_source, bool speculative)
{
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
   memcpy(available_to_evict, file->available_to_evict,
          sizeof(available_to_evict));

   for (unsigned i = 0; i < reg_size(reg); i++)
      BITSET_CLEAR(available_to_evict, physreg + i);

   unsigned eviction_count = 0;
   /* Iterate over each range conflicting with physreg */
   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
                           *next = ra_interval_next_or_null(conflicting);
        conflicting != NULL &&
        conflicting->physreg_start < physreg + reg_size(reg);
        conflicting = next, next = ra_interval_next_or_null(next)) {
      if (!is_source && conflicting->is_killed)
         continue;

      if (conflicting->frozen) {
         assert(speculative);
         return false;
      }

      unsigned avail_start, avail_end;
      bool evicted = false;
      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
                            reg_file_size(file, conflicting->interval.reg)) {
         unsigned size = avail_end - avail_start;

         /* non-half registers must be aligned */
         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
             avail_start % 2 == 1) {
            avail_start++;
            size--;
         }

         if (size >= conflicting->physreg_end - conflicting->physreg_start) {
            for (unsigned i = 0;
                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
               BITSET_CLEAR(available_to_evict, avail_start + i);
            eviction_count +=
               conflicting->physreg_end - conflicting->physreg_start;
            if (!speculative)
               ra_move_interval(ctx, file, conflicting, avail_start);
            evicted = true;
            break;
         }
      }

      if (!evicted)
         return false;
   }

   *_eviction_count = eviction_count;
   return true;
}

static int
removed_interval_cmp(const void *_i1, const void *_i2)
{
   const struct ra_removed_interval *i1 = _i1;
   const struct ra_removed_interval *i2 = _i2;

   /* We sort the registers as follows:
    *
    * |--------------------------------------------------------------------|
    * |                    |             |             |                   |
    * | Half live-through  | Half killed | Full killed | Full live-through |
    * |                    |             |             |                   |
    * |--------------------------------------------------------------------|
    *                      |                           |
    *                      |        Destination        |
    *                      |                           |
    *                      |---------------------------|
    *
    * Half-registers have to be first so that they stay in the low half of
    * the register file. Then half and full killed must stay together so that
    * there's a contiguous range where we can put the register. With this
    * structure we should be able to accommodate any collection of intervals
    * such that the total number of half components is within the half limit
    * and the combined components are within the full limit.
    */

   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (i2->interval->is_killed)
         return -1;
      if (i1->interval->is_killed)
         return 1;
   } else {
      if (i2->interval->is_killed)
         return 1;
      if (i1->interval->is_killed)
         return -1;
   }

   return 0;
}

/* "Compress" all the live intervals so that there is enough space for the
 * destination register. As there can be gaps when a more-aligned interval
 * follows a less-aligned interval, this also sorts them to remove such
 * "padding", which may be required when space is very tight. This isn't
 * amazing, but should be used only as a last resort in case the register file
 * is almost full and badly fragmented.
 *
 * Return the physreg to use.
 */
static physreg_t
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
                   unsigned align, bool is_source)
{
   DECLARE_ARRAY(struct ra_removed_interval, intervals);
   intervals_count = intervals_sz = 0;
   intervals = NULL;

   unsigned removed_full_size = 0;
   unsigned removed_half_size = 0;
   unsigned file_size =
      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
   physreg_t start_reg = 0;

   foreach_interval_rev_safe (interval, file) {
      /* Check if we can sort the intervals *after* this one and have
       * enough space leftover to accommodate "size" units.
       */
      if (align == 1) {
         if (interval->physreg_end + removed_half_size <= file_size - size) {
            start_reg = interval->physreg_end;
            break;
         }
      } else {
         if (interval->physreg_end + removed_half_size <=
             file_size - removed_full_size - size) {
            start_reg = interval->physreg_end;
            break;
         }
      }

      /* We assume that all frozen intervals are at the start and that we
       * can avoid popping them.
       */
      assert(!interval->frozen);

      /* Killed sources don't count because they go at the end and can
       * overlap the register we're trying to add.
       */
      if (!interval->is_killed && !is_source) {
         if (interval->interval.reg->flags & IR3_REG_HALF)
            removed_half_size +=
               interval->physreg_end - interval->physreg_start;
         else
            removed_full_size +=
               interval->physreg_end - interval->physreg_start;
      }

      /* Now that we've done the accounting, pop this off */
      d("popping interval %u physreg %u\n", interval->interval.reg->name,
        interval->physreg_start);
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }

   /* TODO: In addition to skipping registers at the beginning that are
    * well-packed, we should try to skip registers at the end.
    */

   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);

   physreg_t physreg = start_reg;
   physreg_t ret_reg = (physreg_t)~0;
   for (unsigned i = 0; i < intervals_count; i++) {
      if (ret_reg == (physreg_t)~0 &&
          ((intervals[i].interval->is_killed && !is_source) ||
           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
         ret_reg = ALIGN(physreg, align);
      }

      if (ret_reg != (physreg_t)~0 &&
          (is_source || !intervals[i].interval->is_killed)) {
         physreg = MAX2(physreg, ret_reg + size);
      }

      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
         physreg = ALIGN(physreg, 2);
      }

      if (physreg + intervals[i].size >
          reg_file_size(file, intervals[i].interval->interval.reg)) {
         d("ran out of room for interval %u!\n",
           intervals[i].interval->interval.reg->name);
         unreachable("reg pressure calculation was wrong!");
         return 0;
      }

      d("pushing interval %u physreg %u\n",
        intervals[i].interval->interval.reg->name, physreg);
      ra_push_interval(ctx, file, &intervals[i], physreg);

      physreg += intervals[i].size;
   }

   if (ret_reg == (physreg_t)~0)
      ret_reg = physreg;

   ret_reg = ALIGN(ret_reg, align);
   if (ret_reg + size > file_size) {
      d("ran out of room for the new interval!\n");
      unreachable("reg pressure calculation was wrong!");
      return 0;
   }

   return ret_reg;
}

static void
update_affinity(struct ir3_register *reg, physreg_t physreg)
{
   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
      return;

   if (physreg < reg->merge_set_offset)
      return;

   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
}

/* Try to find free space for a register without shuffling anything. This uses
 * a round-robin algorithm to reduce false dependencies.
 */
static physreg_t
find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
              unsigned align, bool is_source)
{
   BITSET_WORD *available =
      is_source ? file->available_to_evict : file->available;

   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         file->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}

static struct ra_file *
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_SHARED)
      return &ctx->shared;
   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
      return &ctx->full;
   else
      return &ctx->half;
}

/* This is the main entrypoint for picking a register. Pick a free register
 * for "reg", shuffling around sources if necessary. In the normal case where
 * "is_source" is false, this register can overlap with killed sources
 * (intervals with "is_killed == true"). If "is_source" is true, then
 * is_killed is ignored and the register returned must not overlap with killed
 * sources. This must be used for tied registers, because we're actually
 * allocating the destination and the tied source at the same time.
 */

static physreg_t
get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
        bool is_source)
{
   unsigned file_size = reg_file_size(file, reg);
   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
      physreg_t preferred_reg =
         reg->merge_set->preferred_reg + reg->merge_set_offset;
      if (preferred_reg < file_size &&
          preferred_reg % reg_elem_size(reg) == 0 &&
          get_reg_specified(file, reg, preferred_reg, is_source))
         return preferred_reg;
   }

   /* If this register is a subset of a merge set which we have not picked a
    * register for, first try to allocate enough space for the entire merge
    * set.
    */
   unsigned size = reg_size(reg);
   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
       size < reg->merge_set->size) {
      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
                                         reg->merge_set->alignment, is_source);
      if (best_reg != (physreg_t)~0u) {
         best_reg += reg->merge_set_offset;
         return best_reg;
      }
   }

   /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
    * Because this doesn't introduce unnecessary dependencies, and it
    * potentially avoids needing (ss) syncs for write after read hazards for
    * SFU instructions:
    */
   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
         struct ir3_register *src = reg->instr->srcs[i];
         if (!ra_reg_is_src(src))
            continue;
         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
            if (src_physreg % reg_elem_size(reg) == 0 &&
                src_physreg + size <= file_size &&
                get_reg_specified(file, reg, src_physreg, is_source))
               return src_physreg;
         }
      }
   }

   physreg_t best_reg =
      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
   if (best_reg != (physreg_t)~0u) {
      return best_reg;
   }

   /* Ok, we couldn't find anything that fits. Here is where we have to start
    * moving things around to make stuff fit. First try solely evicting
    * registers in the way.
    */
   unsigned best_eviction_count = ~0;
   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
      unsigned eviction_count;
      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
         if (eviction_count < best_eviction_count) {
            best_eviction_count = eviction_count;
            best_reg = i;
         }
      }
   }

   if (best_eviction_count != ~0) {
      ASSERTED bool result = try_evict_regs(
         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
      assert(result);
      return best_reg;
   }

   /* Use the dumb fallback only if try_evict_regs() fails. */
   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
                             is_source);
}

static void
assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
           unsigned num)
{
   if (reg->flags & IR3_REG_ARRAY) {
      reg->array.base = num;
      if (reg->flags & IR3_REG_RELATIV)
         reg->array.offset += num;
      else
         reg->num = num + reg->array.offset;
   } else {
      reg->num = num;
   }
}

static void
mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];

   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
       interval->interval.parent ||
       !rb_tree_is_empty(&interval->interval.children))
      return;

   ra_file_mark_killed(ra_get_file(ctx, src), interval);
}

static void
insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);
   struct ra_interval *interval = &ctx->intervals[dst->name];

   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));

   if (!(dst->flags & IR3_REG_UNUSED))
      ra_file_insert(file, interval);

   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
}

static void
allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                   physreg_t physreg)
{
   struct ra_interval *interval = &ctx->intervals[dst->name];
   update_affinity(dst, physreg);

   ra_interval_init(interval, dst);
   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(dst);
}

static void
allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);

   struct ir3_register *tied = dst->tied;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
      if (tied_interval->is_killed) {
         /* The easy case: the source is killed, so we can just reuse it
          * for the destination.
          */
         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
      } else {
         /* The source is live-through, so we need to get a free register
          * (which is free for both the source and destination!), copy the
          * original source to it, then use that for the source and
          * destination.
          */
         physreg_t physreg = get_reg(ctx, file, dst, true);
         allocate_dst_fixed(ctx, dst, physreg);
         array_insert(ctx, ctx->parallel_copies,
                      (struct ra_parallel_copy){
                         .interval = dst_interval,
                         .src = tied_physreg,
                      });
      }

      return;
   }

   /* All the hard work is done by get_reg here. */
   physreg_t physreg = get_reg(ctx, file, dst, false);

   allocate_dst_fixed(ctx, dst, physreg);
}

static void
assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];
   struct ra_file *file = ra_get_file(ctx, src);

   struct ir3_register *tied = src->tied;
   physreg_t physreg;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
      physreg = ra_interval_get_physreg(tied_interval);
   } else {
      physreg = ra_interval_get_physreg(interval);
   }

   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));

   if (src->flags & IR3_REG_FIRST_KILL)
      ra_file_remove(file, interval);
}

/* Insert a parallel copy instruction before the instruction with the parallel
 * copy entries we've built up.
 */
static void
insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->parallel_copies_count == 0)
      return;

   struct ir3_instruction *pcopy =
      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
                       ctx->parallel_copies_count, ctx->parallel_copies_count);

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_dst_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
   }

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_src_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
   }

   list_del(&pcopy->node);
   list_addtail(&pcopy->node, &instr->node);
   ctx->parallel_copies_count = 0;
}

static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* First, mark sources as going-to-be-killed while allocating the dest. */
   ra_foreach_src (src, instr) {
      mark_src_killed(ctx, src);
   }

   /* Allocate the destination. */
   ra_foreach_dst (dst, instr) {
      allocate_dst(ctx, dst);
   }

   /* Now handle sources. Go backward so that in case there are multiple
    * sources with the same def and that def is killed we only remove it at
    * the end.
    */
   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }

   /* Now finally insert the destination into the map. */
   ra_foreach_dst (dst, instr) {
      insert_dst(ctx, dst);
   }

   insert_parallel_copy_instr(ctx, instr);
}

static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_register *dst = instr->dsts[0];
   struct ir3_register *src = instr->srcs[0];

   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
      handle_normal_instr(ctx, instr);
      return;
   }

   struct ra_interval *src_interval = &ctx->intervals[src->def->name];

   physreg_t physreg = ra_interval_get_physreg(src_interval);
   assign_src(ctx, instr, src);

   allocate_dst_fixed(
      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
   insert_dst(ctx, dst);
}

static void
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
   unsigned dst_offset = instr->dsts[0]->merge_set_offset;

   if (!dst_set || dst_set->regs_count == 1) {
      handle_normal_instr(ctx, instr);
      return;
   }

   /* We need to check if any of the sources are contained in an interval
    * that is at least as large as the vector. In this case, we should put
    * the vector inside that larger interval. (There should be one
    * unambiguous place to put it, because values sharing the same merge set
    * should be allocated together.) This can happen in a case like:
    *
    * ssa_1 (wrmask=0xf) = ...
    * ssa_2 = split ssa_1 off:0
    * ssa_3 = split ssa_1 off:1
    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
    * ... = (kill)ssa_1
    * ... = (kill)ssa_4
    *
    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
    */
   physreg_t dst_fixed = (physreg_t)~0u;

   for (unsigned i = 0; i < instr->srcs_count; i++) {
      if (!ra_reg_is_src(instr->srcs[i]))
         continue;

      if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) {
         mark_src_killed(ctx, instr->srcs[i]);
      }

      struct ir3_register *src = instr->srcs[i];
      struct ra_interval *interval = &ctx->intervals[src->def->name];

      if (src->def->merge_set != dst_set || interval->is_killed)
         continue;
      while (interval->interval.parent != NULL) {
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
      }
      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
         dst_fixed = interval->physreg_start -
                     interval->interval.reg->merge_set_offset + dst_offset;
      } else {
         /* For sources whose root interval is smaller than the
          * destination (i.e. the normal case), we will shuffle them
          * around after allocating the destination. Mark them killed so
          * that the destination can be allocated over them, even if they
          * aren't actually killed.
          */
         ra_file_mark_killed(ra_get_file(ctx, src), interval);
      }
   }

   if (dst_fixed != (physreg_t)~0u)
      allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
   else
      allocate_dst(ctx, instr->dsts[0]);

   /* Remove the temporary is_killed we added */
   for (unsigned i = 0; i < instr->srcs_count; i++) {
      if (!ra_reg_is_src(instr->srcs[i]))
         continue;

      struct ir3_register *src = instr->srcs[i];
      struct ra_interval *interval = &ctx->intervals[src->def->name];
      while (interval->interval.parent != NULL) {
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
      }

      /* Filter out cases where it actually should be killed */
      if (interval != &ctx->intervals[src->def->name] ||
          !(src->flags & IR3_REG_KILL))
         interval->is_killed = false;
   }

   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }

   /* We need to do this before insert_dst(), so that children of the
    * destination which got marked as killed and then shuffled around to make
    * space for the destination have the correct pcopy destination that
    * matches what we assign the source of the collect to in assign_src().
    *
    * TODO: In this case we'll wind up copying the value in the pcopy and
    * then again in the collect. We could avoid one of those by updating the
    * pcopy destination to match up with the final location of the source
    * after the collect and making the collect a no-op. However this doesn't
    * seem to happen often.
    */
   insert_parallel_copy_instr(ctx, instr);

   /* Note: insert_dst will automatically shuffle around any intervals that
    * are a child of the collect by making them children of the collect.
    */

   insert_dst(ctx, instr->dsts[0]);
}

/* Parallel copies before RA should only be at the end of the block, for
 * phi's. For these we only need to fill in the sources, and then we fill in
 * the destinations in the successor block.
 */
static void
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }
}

/* Some inputs may need to be precolored. We need to handle those first, so
 * that other non-precolored inputs don't accidentally get allocated over
 * them. Inputs are the very first thing in the shader, so it shouldn't be a
 * problem to allocate them to a specific physreg.
 */

static void
handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (instr->dsts[0]->num == INVALID_REG)
      return;

   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
   allocate_dst_fixed(ctx, instr->dsts[0], physreg);
   insert_dst(ctx, instr->dsts[0]);
   interval->frozen = true;
}

static void
handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (instr->dsts[0]->num != INVALID_REG)
      return;

   allocate_dst(ctx, instr->dsts[0]);

   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   ra_file_insert(file, interval);
}

static void
assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);

   if (instr->dsts[0]->num == INVALID_REG) {
      assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
   } else {
      interval->frozen = false;
   }

   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
      ra_file_remove(file, interval);

   ra_foreach_src_rev (src, instr)
      assign_src(ctx, instr, src);
}

/* chmask is a bit weird, because it has pre-colored sources due to the need
 * to pass some registers to the next stage. Fortunately there are only at
 * most two, and there should be no other live values by the time we get to
 * this instruction, so we only have to do the minimum and don't need any
 * fancy fallbacks.
 *
 * TODO: Add more complete handling of precolored sources, e.g. for function
 * argument handling. We'd need a way to mark sources as fixed so that they
 * don't get moved around when placing other sources in the fallback case, and
 * a duplication of much of the logic in get_reg(). This also opens another
 * can of worms, e.g. what if the precolored source is a split of a vector
 * which is still live -- this breaks our assumption that splits don't incur
 * any "extra" register requirements and we'd have to break it out of the
 * parent ra_interval.
 */

static void
handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
{
   struct ra_file *file = ra_get_file(ctx, src);
   struct ra_interval *interval = &ctx->intervals[src->def->name];
   physreg_t physreg = ra_reg_get_physreg(src);

   if (ra_interval_get_num(interval) == src->num)
      return;

   /* Try evicting stuff in our way if it isn't free. This won't move
    * anything unless it overlaps with our precolored physreg, so we don't
    * have to worry about evicting other precolored sources.
    */
   if (!get_reg_specified(file, src, physreg, true)) {
      unsigned eviction_count;
      if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
                          false)) {
         unreachable("failed to evict for precolored source!");
         return;
      }
   }

   ra_move_interval(ctx, file, interval, physreg);
}

static void
handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* Note: we purposely don't mark sources as killed, so that we can reuse
    * some of the get_reg() machinery as if the source were a destination.
    * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
    * work correctly.
    */
1503
ra_foreach_src (src, instr) {
1504
assert(src->num != INVALID_REG);
1505
handle_precolored_source(ctx, src);
1506
}
1507
1508
ra_foreach_src (src, instr) {
1509
struct ra_file *file = ra_get_file(ctx, src);
1510
struct ra_interval *interval = &ctx->intervals[src->def->name];
1511
if (src->flags & IR3_REG_FIRST_KILL)
1512
ra_file_remove(file, interval);
1513
}
1514
1515
insert_parallel_copy_instr(ctx, instr);
1516
}
1517
1518
static physreg_t
1519
read_register(struct ra_ctx *ctx, struct ir3_block *block,
1520
struct ir3_register *def)
1521
{
1522
struct ra_block_state *state = &ctx->blocks[block->index];
1523
if (state->renames) {
1524
struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1525
if (entry) {
1526
return (physreg_t)(uintptr_t)entry->data;
1527
}
1528
}
1529
1530
return ra_reg_get_physreg(def);
1531
}
1532
1533
static void
1534
handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1535
{
1536
physreg_t physreg = ~0;
1537
for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1538
struct ir3_block *pred = ctx->block->predecessors[i];
1539
struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1540
1541
if (!pred_state->visited ||
1542
(pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED)))
1543
continue;
1544
1545
physreg = read_register(ctx, pred, def);
1546
break;
1547
}
1548
1549
assert(physreg != (physreg_t)~0);
1550
1551
struct ra_interval *interval = &ctx->intervals[def->name];
1552
struct ra_file *file = ra_get_file(ctx, def);
1553
ra_interval_init(interval, def);
1554
interval->physreg_start = physreg;
1555
interval->physreg_end = physreg + reg_size(def);
1556
ra_file_insert(file, interval);
1557
}
1558
1559
static void
1560
handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1561
{
1562
/* Skip parallelcopy's which in the original program are only used as phi
1563
* arguments. Even though phi arguments are live out, they are only
1564
* assigned when the phi is.
1565
*/
1566
if (def->instr->opc == OPC_META_PARALLEL_COPY)
1567
return;
1568
1569
struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1570
struct ra_interval *interval = &ctx->intervals[def->name];
1571
physreg_t physreg = ra_interval_get_physreg(interval);
1572
if (physreg != ra_reg_get_physreg(def)) {
1573
if (!state->renames)
1574
state->renames = _mesa_pointer_hash_table_create(ctx);
1575
_mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1576
}
1577
}
1578
1579
static void
1580
handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1581
{
1582
struct ra_file *file = ra_get_file(ctx, def);
1583
struct ra_interval *interval = &ctx->intervals[def->name];
1584
1585
/* phis are always scalar, so they should already be the smallest possible
1586
* size. However they may be coalesced with other live-in values/phi
1587
* nodes, so check for that here.
1588
*/
1589
struct ir3_reg_interval *parent_ir3 =
1590
ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1591
physreg_t physreg;
1592
if (parent_ir3) {
1593
struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1594
physreg = ra_interval_get_physreg(parent) +
1595
(def->interval_start - parent_ir3->reg->interval_start);
1596
} else {
1597
physreg = get_reg(ctx, file, def, false);
1598
}
1599
1600
allocate_dst_fixed(ctx, def, physreg);
1601
1602
ra_file_insert(file, interval);
1603
}
1604
1605
static void
1606
assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1607
{
1608
struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1609
struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
1610
assert(!interval->interval.parent);
1611
unsigned num = ra_interval_get_num(interval);
1612
assign_reg(phi, phi->dsts[0], num);
1613
1614
/* Assign the parallelcopy sources of this phi */
1615
for (unsigned i = 0; i < phi->srcs_count; i++) {
1616
if (phi->srcs[i]->def) {
1617
assign_reg(phi, phi->srcs[i], num);
1618
assign_reg(phi, phi->srcs[i]->def, num);
1619
}
1620
}
1621
1622
if (phi->dsts[0]->flags & IR3_REG_UNUSED)
1623
ra_file_remove(file, interval);
1624
}
1625
1626
/* When we split a live range, we sometimes need to emit fixup code at the end
1627
* of a block. For example, something like:
1628
*
1629
* a = ...
1630
* if (...) {
1631
* ...
1632
* a' = a
1633
* b = ... // a evicted to make room for b
1634
* ...
1635
* }
1636
* ... = a
1637
*
1638
* When we insert the copy to a' in insert_parallel_copy_instr(), this forces
1639
* to insert another copy "a = a'" at the end of the if. Normally this would
1640
* also entail adding a phi node, but since we're about to go out of SSA
1641
* anyway we just insert an extra move. Note, however, that "b" might be used
1642
* in a phi node at the end of the if and share registers with "a", so we
1643
* have to be careful to extend any preexisting parallelcopy instruction
1644
* instead of creating our own in order to guarantee that they properly get
1645
* swapped.
1646
*/
1647
1648
static void
insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
                    struct ir3_register *reg)
{
   struct ir3_instruction *old_pcopy = NULL;
   if (!list_is_empty(&block->instr_list)) {
      struct ir3_instruction *last =
         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
      if (last->opc == OPC_META_PARALLEL_COPY)
         old_pcopy = last;
   }

   unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
   struct ir3_instruction *pcopy = ir3_instr_create(
      block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);

   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
      old_pcopy->dsts[i]->instr = pcopy;
      pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
   }

   struct ir3_register *dst_reg =
      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
   dst_reg->wrmask = reg->wrmask;
   dst_reg->size = reg->size;
   assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));

   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
      pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
   }

   struct ir3_register *src_reg =
      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
   src_reg->wrmask = reg->wrmask;
   src_reg->size = reg->size;
   assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));

   if (old_pcopy)
      list_del(&old_pcopy->node);
}

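/* Make sure a live-in value occupies the same physical register at the end of
 * every already-visited predecessor as it does at the start of this block,
 * inserting a copy at the end of the predecessor where it doesn't and
 * recording the rename for successors processed later.
 */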
static void
insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
{
   physreg_t physreg = ra_interval_get_physreg(interval);

   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
   struct ir3_block **predecessors =
      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
   unsigned predecessors_count = shared
      ? ctx->block->physical_predecessors_count
      : ctx->block->predecessors_count;

   for (unsigned i = 0; i < predecessors_count; i++) {
      struct ir3_block *pred = predecessors[i];
      struct ra_block_state *pred_state = &ctx->blocks[pred->index];

      if (!pred_state->visited)
         continue;

      physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
      if (pred_reg != physreg) {
         insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);

         /* This is a bit tricky, but when visiting the destination of a
          * physical-only edge, we have two predecessors (the if and the
          * header block) and both have multiple successors. We pick the
          * register for all live-ins from the normal edge, which should
          * guarantee that there's no need for shuffling things around in
          * the normal predecessor as long as there are no phi nodes, but
          * we still may need to insert fixup code in the physical
          * predecessor (i.e. the last block of the if) and that has
          * another successor (the block after the if) so we need to update
          * the renames state for when we process the other successor. This
          * crucially depends on the other successor getting processed
          * after this.
          *
          * For normal (non-physical) edges we disallow critical edges so
          * that hacks like this aren't necessary.
          */
         if (!pred_state->renames)
            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
                                 (void *)(uintptr_t)physreg);
      }
   }
}

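/* Walk every interval currently allocated in this file and insert live-in
 * moves for the ones that correspond to live-in values (as opposed to phis).
 */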
static void
insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
{
   BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      /* Skip phi nodes. This needs to happen after phi nodes are allocated,
       * because we may have to move live-ins around to make space for phi
       * nodes, but we shouldn't be handling phi nodes here.
       */
      if (BITSET_TEST(live_in, interval->interval.reg->name))
         insert_live_in_move(ctx, interval);
   }
}

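/* Record, for every interval in this file, which physical register it
 * occupies on entry to the current block.
 */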
static void
insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
                              (void *)(uintptr_t)interval->physreg_start);
   }
}

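/* Fix up live-ins for all three register files. If some predecessor hasn't
 * been visited yet, also snapshot our entry registers so that the predecessor
 * can insert the matching live-out moves when it is processed.
 */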
static void
insert_live_in_moves(struct ra_ctx *ctx)
{
   insert_file_live_in_moves(ctx, &ctx->full);
   insert_file_live_in_moves(ctx, &ctx->half);
   insert_file_live_in_moves(ctx, &ctx->shared);

   /* If not all predecessors have been visited yet, record the registers
    * that values occupy on entry to this block, so that
    * insert_live_out_moves() can consult them when those predecessors are
    * processed later.
    */
   bool all_preds_visited = true;
   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
         all_preds_visited = false;
         break;
      }
   }

   if (!all_preds_visited) {
      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
      state->entry_regs = _mesa_pointer_hash_table_create(ctx);

      insert_entry_regs(state, &ctx->full);
      insert_entry_regs(state, &ctx->half);
      insert_entry_regs(state, &ctx->shared);
   }
}

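/* For each already-visited successor that recorded entry registers, copy this
 * interval into the register the successor expects if that differs from where
 * the value currently lives.
 */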
static void
insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!ctx->block->successors[i])
         continue;

      struct ir3_block *succ = ctx->block->successors[i];
      struct ra_block_state *succ_state = &ctx->blocks[succ->index];

      if (!succ_state->visited)
         continue;

      struct hash_entry *entry = _mesa_hash_table_search(
         succ_state->entry_regs, interval->interval.reg);
      if (!entry)
         continue;

      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
      if (new_reg != interval->physreg_start) {
         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
                             interval->interval.reg);
      }
   }
}

static void
insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      insert_live_out_move(ctx, interval);
   }
}

static void
insert_live_out_moves(struct ra_ctx *ctx)
{
   insert_file_live_out_moves(ctx, &ctx->full);
   insert_file_live_out_moves(ctx, &ctx->half);
   insert_file_live_out_moves(ctx, &ctx->shared);
}

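/* Allocate a single block: live-ins, phis, and inputs are handled first (in
 * parallel), then the instructions in the body, and finally the live-out
 * fixups for already-visited successors.
 */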
static void
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;

   /* Reset the register files from the last block */
   ra_file_init(&ctx->full);
   ra_file_init(&ctx->half);
   ra_file_init(&ctx->shared);

   bool unreachable = false;
   if (block != ir3_start_block(ctx->ir)) {
      unreachable = true;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ra_block_state *pred_state =
            &ctx->blocks[block->predecessors[i]->index];
         if (!pred_state->logical_unreachable) {
            unreachable = false;
            break;
         }
      }
   }

   ctx->blocks[block->index].logical_unreachable = unreachable;

   /* Handle live-ins, phis, and input meta-instructions. These all appear
    * live at the beginning of the block, interfere with each other, and
    * therefore need to be allocated "in parallel". This means that we have
    * to allocate all of them, inserting them into the file, and then delay
    * updating the IR until all of them are allocated.
    *
    * Handle precolored inputs first, because we need to make sure that other
    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
    * and inputs at the same time, because the first block doesn't have
    * predecessors. Therefore handle_live_in doesn't have to worry about
    * them.
    */

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_INPUT)
         handle_precolored_input(ctx, instr);
      else
         break;
   }

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      if (unreachable && !(reg->flags & IR3_REG_SHARED))
         continue;
      handle_live_in(ctx, reg);
   }

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_PHI)
         handle_phi(ctx, instr->dsts[0]);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         handle_input(ctx, instr);
      else
         break;
   }

   /* After this point, every live-in/phi/input has an interval assigned to
    * it. We delay actually assigning values until everything has been
    * allocated, so we can simply ignore any parallel copy entries created
    * when shuffling them around.
    */
   ctx->parallel_copies_count = 0;

   insert_live_in_moves(ctx);

   if (RA_DEBUG) {
      printf("after live-in block %u:\n", block->index);
      ra_ctx_dump(ctx);
   }

   /* Now we're done with processing live-ins, and can handle the body of the
    * block.
    */
   foreach_instr (instr, &block->instr_list) {
      if (RA_DEBUG) {
         printf("processing: ");
         ir3_print_instr(instr);
      }

      if (instr->opc == OPC_META_PHI)
         assign_phi(ctx, instr);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         assign_input(ctx, instr);
      else if (instr->opc == OPC_META_SPLIT)
         handle_split(ctx, instr);
      else if (instr->opc == OPC_META_COLLECT)
         handle_collect(ctx, instr);
      else if (instr->opc == OPC_META_PARALLEL_COPY)
         handle_pcopy(ctx, instr);
      else if (instr->opc == OPC_CHMASK)
         handle_chmask(ctx, instr);
      else
         handle_normal_instr(ctx, instr);

      if (RA_DEBUG)
         ra_ctx_dump(ctx);
   }

   insert_live_out_moves(ctx);

   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_out(ctx, reg);
   }

   ctx->blocks[block->index].visited = true;
}

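/* Pick the size of the full register file: start from the maximum pressure
 * (rounded up to whole vec4s) and keep growing the target as long as doing so
 * neither changes the threadsize decision nor drops below the achievable wave
 * count. As an illustration (not taken from a real shader), a full pressure
 * of 20 half-regs rounds up to DIV_ROUND_UP(20, 2 * 4) = 3 vec4s as the
 * starting point.
 */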
static unsigned
calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
{
   /* Registers are allocated in units of vec4, so switch from units of
    * half-regs to vec4.
    */
   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);

   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);

   unsigned target = reg_count;
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      v->shader->compiler, reg_count, double_threadsize);
   unsigned target_waves =
      MIN2(reg_independent_max_waves, reg_dependent_max_waves);

   while (target <= RA_FULL_SIZE / (2 * 4) &&
          ir3_should_double_threadsize(v, target) == double_threadsize &&
          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
                                          double_threadsize) >= target_waves)
      target++;

   return (target - 1) * 2 * 4;
}

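/* Top-level entry point: compute dominance and parallel copies, build
 * liveness and merge sets, size the register files from the calculated
 * pressure, allocate each block, validate the result, and finally strip
 * SSA/array flags and lower the remaining copies.
 */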
int
ir3_ra(struct ir3_shader_variant *v)
{
   ir3_calc_dominance(v->ir);

   ir3_create_parallel_copies(v->ir);

   struct ir3_liveness *live = ir3_calc_liveness(v);

   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");

   ir3_merge_regs(live, v->ir);

   struct ir3_pressure max_pressure;
   ir3_calc_pressure(v, live, &max_pressure);
   d("max pressure:");
   d("\tfull: %u", max_pressure.full);
   d("\thalf: %u", max_pressure.half);
   d("\tshared: %u", max_pressure.shared);

   if (v->mergedregs) {
      max_pressure.full += max_pressure.half;
      max_pressure.half = 0;
   }

   if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE ||
       max_pressure.shared > RA_SHARED_SIZE) {
      d("max pressure exceeded!");
      return 1;
   }

   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);

   ctx->ir = v->ir;
   ctx->merged_regs = v->mergedregs;
   ctx->compiler = v->shader->compiler;
   ctx->stage = v->type;
   ctx->live = live;
   ctx->intervals =
      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);

   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
   d("full size: %u", ctx->full.size);

   if (!v->mergedregs)
      ctx->half.size = RA_HALF_SIZE;

   ctx->shared.size = RA_SHARED_SIZE;

   foreach_block (block, &v->ir->block_list)
      handle_block(ctx, block);

   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);

   /* Strip array-ness and SSA-ness at the end, because various helpers still
    * need to work even on definitions that have already been assigned. For
    * example, we need to preserve array-ness so that array live-ins have the
    * right size.
    */
   foreach_block (block, &v->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            instr->dsts[i]->flags &= ~IR3_REG_SSA;

            /* Parallel copies of array registers copy the whole register,
             * and we need some way to let the parallel copy code know
             * that this was an array whose size is determined by
             * reg->size. So keep the array flag on those.
             */
            if (!is_meta(instr))
               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
         }

         for (unsigned i = 0; i < instr->srcs_count; i++) {
            instr->srcs[i]->flags &= ~IR3_REG_SSA;

            if (!is_meta(instr))
               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
         }
      }
   }

   ir3_debug_print(v->ir, "AFTER: register allocation");

   ir3_lower_copies(v);

   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");

   ralloc_free(ctx);
   ralloc_free(live);
   return 0;
}