CoCalc -- ir3

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_ra.c
⁴⁵⁶⁵ views
1
/*
2
 * Copyright (C) 2021 Valve Corporation
3
 * Copyright (C) 2014 Rob Clark <[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24

25
#include "ir3_ra.h"
26
#include "util/rb_tree.h"
27
#include "util/u_math.h"
28
#include "ir3_shader.h"
29

30
/* This file implements an SSA-based register allocator. Unlike other
31
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
32
 * that multiple values may share the same register interval. From the
33
 * perspective of the allocator itself, only the top-level intervals matter,
34
 * and the allocator is only concerned with allocating top-level intervals,
35
 * which may mean moving other top-level intervals around. Other intervals,
36
 * like the destination of a split instruction or the source of a collect
37
 * instruction, are "locked" to their parent interval. The details of this are
38
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39
 *
40
 * We currently don't do any backtracking, but we do use the merge sets as a
41
 * form of affinity to try to avoid moves from phis/splits/collects. Each
42
 * merge set is what a more "classic" graph-coloring or live-range based
43
 * allocator would consider a single register, but here we use it as merely a
44
 * hint, except when multiple overlapping values are live at the same time.
45
 * Each merge set has a "preferred" register, and we try to honor that when
46
 * allocating values in the merge set.
47
 */
48

49
/* ir3_reg_ctx implementation. */
50

51
static int
52
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53
{
54
   physreg_t reg = *(const physreg_t *)data;
55
   const struct ir3_reg_interval *interval =
56
      ir3_rb_node_to_interval_const(node);
57
   if (interval->reg->interval_start > reg)
58
      return -1;
59
   else if (interval->reg->interval_end <= reg)
60
      return 1;
61
   else
62
      return 0;
63
}
64

65
static struct ir3_reg_interval *
66
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67
{
68
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69
   return node ? ir3_rb_node_to_interval(node) : NULL;
70
}
71

72
static struct ir3_reg_interval *
73
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74
{
75
   struct rb_node *node =
76
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77
   return node ? ir3_rb_node_to_interval(node) : NULL;
78
}
79

80
/* Get the interval covering the reg, or the closest to the right if it
81
 * doesn't exist.
82
 */
83
static struct ir3_reg_interval *
84
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85
{
86
   struct ir3_reg_interval *interval =
87
      ir3_reg_interval_search_sloppy(tree, offset);
88
   if (!interval) {
89
      return NULL;
90
   } else if (interval->reg->interval_end > offset) {
91
      return interval;
92
   } else {
93
      /* There is no interval covering reg, and ra_file_search_sloppy()
94
       * returned the closest range to the left, so the next interval to the
95
       * right should be the closest to the right.
96
       */
97
      return ir3_reg_interval_next_or_null(interval);
98
   }
99
}
100

101
static int
102
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103
{
104
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106
   return b->reg->interval_start - a->reg->interval_start;
107
}
108

109
static void
110
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111
                struct ir3_reg_interval *interval)
112
{
113
   struct ir3_reg_interval *right =
114
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115
   if (right && right->reg->interval_start < interval->reg->interval_end) {
116
      /* We disallow trees where different members have different half-ness.
117
       * This means that we can't treat bitcasts as copies like normal
118
       * split/collect, so something like this would require an extra copy
119
       * in mergedregs mode, and count as 4 half-units of register pressure
120
       * instead of 2:
121
       *
122
       * f16vec2 foo = unpackFloat2x16(bar)
123
       * ... = foo.x
124
       * ... = bar
125
       *
126
       * However, relaxing this rule would open a huge can of worms. What
127
       * happens when there's a vector of 16 things, and the fifth element
128
       * has been bitcasted as a half-reg? Would that element alone have to
129
       * be small enough to be used as a half-reg source? Let's keep that
130
       * can of worms firmly shut for now.
131
       */
132
      assert((interval->reg->flags & IR3_REG_HALF) ==
133
             (right->reg->flags & IR3_REG_HALF));
134

135
      if (right->reg->interval_end <= interval->reg->interval_end &&
136
          right->reg->interval_start >= interval->reg->interval_start) {
137
         /* Check if we're inserting something that's already inserted */
138
         assert(interval != right);
139

140
         /* "right" is contained in "interval" and must become a child of
141
          * it. There may be further children too.
142
          */
143
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144
              right && right->reg->interval_start < interval->reg->interval_end;
145
              right = next, next = ir3_reg_interval_next_or_null(next)) {
146
            /* "right" must be contained in "interval." */
147
            assert(right->reg->interval_end <= interval->reg->interval_end);
148
            assert((interval->reg->flags & IR3_REG_HALF) ==
149
                   (right->reg->flags & IR3_REG_HALF));
150
            if (!right->parent)
151
               ctx->interval_delete(ctx, right);
152
            right->parent = interval;
153
            rb_tree_remove(tree, &right->node);
154
            rb_tree_insert(&interval->children, &right->node,
155
                           ir3_reg_interval_insert_cmp);
156
         }
157
      } else {
158
         /* "right" must contain "interval," since intervals must form a
159
          * tree.
160
          */
161
         assert(right->reg->interval_start <= interval->reg->interval_start);
162
         interval->parent = right;
163
         interval_insert(ctx, &right->children, interval);
164
         return;
165
      }
166
   }
167

168
   if (!interval->parent)
169
      ctx->interval_add(ctx, interval);
170
   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171
   interval->inserted = true;
172
}
173

174
void
175
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176
                        struct ir3_reg_interval *interval)
177
{
178
   interval_insert(ctx, &ctx->intervals, interval);
179
}
180

181
void
182
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
183
                        struct ir3_reg_interval *interval)
184
{
185
   if (interval->parent) {
186
      rb_tree_remove(&interval->parent->children, &interval->node);
187
   } else {
188
      ctx->interval_delete(ctx, interval);
189
      rb_tree_remove(&ctx->intervals, &interval->node);
190
   }
191

192
   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
193
                         node) {
194
      rb_tree_remove(&interval->children, &child->node);
195
      child->parent = interval->parent;
196

197
      if (interval->parent) {
198
         rb_tree_insert(&child->parent->children, &child->node,
199
                        ir3_reg_interval_insert_cmp);
200
      } else {
201
         ctx->interval_readd(ctx, interval, child);
202
         rb_tree_insert(&ctx->intervals, &child->node,
203
                        ir3_reg_interval_insert_cmp);
204
      }
205
   }
206

207
   interval->inserted = false;
208
}
209

210
void
211
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
212
                            struct ir3_reg_interval *interval)
213
{
214
   assert(!interval->parent);
215

216
   ctx->interval_delete(ctx, interval);
217
   rb_tree_remove(&ctx->intervals, &interval->node);
218
}
219

220
static void
221
interval_dump(struct ir3_reg_interval *interval, unsigned indent)
222
{
223
   for (unsigned i = 0; i < indent; i++)
224
      printf("\t");
225
   printf("reg %u start %u\n", interval->reg->name,
226
          interval->reg->interval_start);
227

228
   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
229
      interval_dump(child, indent + 1);
230
   }
231

232
   for (unsigned i = 0; i < indent; i++)
233
      printf("\t");
234
   printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);
235
}
236

237
void
238
ir3_reg_interval_dump(struct ir3_reg_interval *interval)
239
{
240
   interval_dump(interval, 0);
241
}
242

243
/* These are the core datastructures used by the register allocator. First
244
 * ra_interval and ra_file, which are used for intra-block tracking and use
245
 * the ir3_reg_ctx infrastructure:
246
 */
247

248
struct ra_interval {
249
   struct ir3_reg_interval interval;
250

251
   struct rb_node physreg_node;
252
   physreg_t physreg_start, physreg_end;
253

254
   /* True if this is a source of the current instruction which is entirely
255
    * killed. This means we can allocate the dest over it, but we can't break
256
    * it up.
257
    */
258
   bool is_killed;
259

260
   /* True if this interval cannot be moved from its position. This is only
261
    * used for precolored inputs to ensure that other inputs don't get
262
    * allocated on top of them.
263
    */
264
   bool frozen;
265
};
266

267
struct ra_file {
268
   struct ir3_reg_ctx reg_ctx;
269

270
   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
271
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
272

273
   struct rb_tree physreg_intervals;
274

275
   unsigned size;
276
   unsigned start;
277
};
278

279
/* State for inter-block tracking. When we split a live range to make space
280
 * for a vector, we may need to insert fixup code when a block has multiple
281
 * predecessors that have moved the same live value to different registers.
282
 * This keeps track of state required to do that.
283
 */
284

285
struct ra_block_state {
286
   /* Map of defining ir3_register -> physreg it was allocated to at the end
287
    * of the block.
288
    */
289
   struct hash_table *renames;
290

291
   /* For loops, we need to process a block before all its predecessors have
292
    * been processed. In particular, we need to pick registers for values
293
    * without knowing if all the predecessors have been renamed. This keeps
294
    * track of the registers we chose so that when we visit the back-edge we
295
    * can move them appropriately. If all predecessors have been visited
296
    * before this block is visited then we don't need to fill this out. This
297
    * is a map from ir3_register -> physreg.
298
    */
299
   struct hash_table *entry_regs;
300

301
   /* True if the block has been visited and "renames" is complete.
302
    */
303
   bool visited;
304

305
   /* True if the block is unreachable via the logical CFG. This happens for
306
    * blocks after an if where both sides end in a break/continue. We ignore
307
    * it for everything but shared registers.
308
    */
309
   bool logical_unreachable;
310
};
311

312
struct ra_parallel_copy {
313
   struct ra_interval *interval;
314
   physreg_t src;
315
};
316

317
/* The main context: */
318

319
struct ra_ctx {
320
   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
321
    * half of this file too.
322
    */
323
   struct ra_file full;
324

325
   /* hr0.x - hr63.w, only used without merged-regs. */
326
   struct ra_file half;
327

328
   /* Shared regs. */
329
   struct ra_file shared;
330

331
   struct ir3 *ir;
332

333
   struct ir3_liveness *live;
334

335
   struct ir3_block *block;
336

337
   const struct ir3_compiler *compiler;
338
   gl_shader_stage stage;
339

340
   /* Pending moves of top-level intervals that will be emitted once we're
341
    * finished:
342
    */
343
   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
344

345
   struct ra_interval *intervals;
346
   struct ra_block_state *blocks;
347

348
   bool merged_regs;
349
};
350

351
#define foreach_interval(interval, file)                                       \
352
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
353
                    physreg_node)
354
#define foreach_interval_rev(interval, file)                                   \
355
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
356
                    physreg_node)
357
#define foreach_interval_safe(interval, file)                                  \
358
   rb_tree_foreach_safe (struct ra_interval, interval,                         \
359
                         &(file)->physreg_intervals, physreg_node)
360
#define foreach_interval_rev_safe(interval, file)                              \
361
   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
362
                            &(file)->physreg_intervals, physreg_node)
363

364
static struct ra_interval *
365
rb_node_to_interval(struct rb_node *node)
366
{
367
   return rb_node_data(struct ra_interval, node, physreg_node);
368
}
369

370
static const struct ra_interval *
371
rb_node_to_interval_const(const struct rb_node *node)
372
{
373
   return rb_node_data(struct ra_interval, node, physreg_node);
374
}
375

376
static struct ra_interval *
377
ra_interval_next(struct ra_interval *interval)
378
{
379
   struct rb_node *next = rb_node_next(&interval->physreg_node);
380
   return next ? rb_node_to_interval(next) : NULL;
381
}
382

383
static struct ra_interval *
384
ra_interval_next_or_null(struct ra_interval *interval)
385
{
386
   return interval ? ra_interval_next(interval) : NULL;
387
}
388

389
static int
390
ra_interval_cmp(const struct rb_node *node, const void *data)
391
{
392
   physreg_t reg = *(const physreg_t *)data;
393
   const struct ra_interval *interval = rb_node_to_interval_const(node);
394
   if (interval->physreg_start > reg)
395
      return -1;
396
   else if (interval->physreg_end <= reg)
397
      return 1;
398
   else
399
      return 0;
400
}
401

402
static struct ra_interval *
403
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
404
{
405
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
406
   return node ? rb_node_to_interval(node) : NULL;
407
}
408

409
/* Get the interval covering the reg, or the closest to the right if it
410
 * doesn't exist.
411
 */
412
static struct ra_interval *
413
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
414
{
415
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
416
   if (!interval) {
417
      return NULL;
418
   } else if (interval->physreg_end > reg) {
419
      return interval;
420
   } else {
421
      /* There is no interval covering reg, and ra_file_search_sloppy()
422
       * returned the closest range to the left, so the next interval to the
423
       * right should be the closest to the right.
424
       */
425
      return ra_interval_next_or_null(interval);
426
   }
427
}
428

429
static struct ra_interval *
430
ra_file_search_right(struct ra_file *file, physreg_t reg)
431
{
432
   return ra_interval_search_right(&file->physreg_intervals, reg);
433
}
434

435
static int
436
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
437
{
438
   const struct ra_interval *a = rb_node_to_interval_const(_a);
439
   const struct ra_interval *b = rb_node_to_interval_const(_b);
440
   return b->physreg_start - a->physreg_start;
441
}
442

443
static struct ra_interval *
444
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
445
{
446
   return rb_node_data(struct ra_interval, interval, interval);
447
}
448

449
static struct ra_file *
450
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
451
{
452
   return rb_node_data(struct ra_file, ctx, reg_ctx);
453
}
454

455
static void
456
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
457
{
458
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
459
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
460

461
   /* We can assume in this case that physreg_start/physreg_end is already
462
    * initialized.
463
    */
464
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
465
      BITSET_CLEAR(file->available, i);
466
      BITSET_CLEAR(file->available_to_evict, i);
467
   }
468

469
   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
470
                  ra_interval_insert_cmp);
471
}
472

473
static void
474
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
475
{
476
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
477
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
478

479
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
480
      BITSET_SET(file->available, i);
481
      BITSET_SET(file->available_to_evict, i);
482
   }
483

484
   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
485
}
486

487
static void
488
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
489
               struct ir3_reg_interval *_child)
490
{
491
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
492
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
493

494
   child->physreg_start =
495
      parent->physreg_start + (child->interval.reg->interval_start -
496
                               parent->interval.reg->interval_start);
497
   child->physreg_end =
498
      child->physreg_start +
499
      (child->interval.reg->interval_end - child->interval.reg->interval_start);
500

501
   interval_add(ctx, _child);
502
}
503

504
static void
505
ra_file_init(struct ra_file *file)
506
{
507
   for (unsigned i = 0; i < file->size; i++) {
508
      BITSET_SET(file->available, i);
509
      BITSET_SET(file->available_to_evict, i);
510
   }
511

512
   file->start = 0;
513

514
   rb_tree_init(&file->reg_ctx.intervals);
515
   rb_tree_init(&file->physreg_intervals);
516

517
   file->reg_ctx.interval_add = interval_add;
518
   file->reg_ctx.interval_delete = interval_delete;
519
   file->reg_ctx.interval_readd = interval_readd;
520
}
521

522
static void
523
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
524
{
525
   assert(interval->physreg_start < interval->physreg_end);
526
   assert(interval->physreg_end <= file->size);
527
   if (interval->interval.reg->flags & IR3_REG_HALF)
528
      assert(interval->physreg_end <= RA_HALF_SIZE);
529

530
   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
531
}
532

533
static void
534
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
535
{
536
   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
537
}
538

539
static void
540
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
541
{
542
   assert(!interval->interval.parent);
543

544
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
545
      BITSET_SET(file->available, i);
546
   }
547

548
   interval->is_killed = true;
549
}
550

551
static physreg_t
552
ra_interval_get_physreg(const struct ra_interval *interval)
553
{
554
   unsigned child_start = interval->interval.reg->interval_start;
555

556
   while (interval->interval.parent) {
557
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
558
   }
559

560
   return interval->physreg_start +
561
          (child_start - interval->interval.reg->interval_start);
562
}
563

564
static unsigned
565
ra_interval_get_num(const struct ra_interval *interval)
566
{
567
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
568
                            interval->interval.reg->flags);
569
}
570

571
static void
572
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
573
{
574
   ir3_reg_interval_init(&interval->interval, reg);
575
   interval->is_killed = false;
576
   interval->frozen = false;
577
}
578

579
static void
580
ra_interval_dump(struct ra_interval *interval)
581
{
582
   printf("physreg %u ", interval->physreg_start);
583

584
   ir3_reg_interval_dump(&interval->interval);
585
}
586

587
static void
588
ra_file_dump(struct ra_file *file)
589
{
590
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
591
                    physreg_node) {
592
      ra_interval_dump(interval);
593
   }
594

595
   unsigned start, end;
596
   printf("available:\n");
597
   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
598
      printf("%u-%u ", start, end);
599
   }
600
   printf("\n");
601

602
   printf("available to evict:\n");
603
   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
604
      printf("%u-%u ", start, end);
605
   }
606
   printf("\n");
607
   printf("start: %u\n", file->start);
608
}
609

610
static void
611
ra_ctx_dump(struct ra_ctx *ctx)
612
{
613
   printf("full:\n");
614
   ra_file_dump(&ctx->full);
615
   printf("half:\n");
616
   ra_file_dump(&ctx->half);
617
   printf("shared:\n");
618
   ra_file_dump(&ctx->shared);
619
}
620

621
static unsigned
622
reg_file_size(struct ra_file *file, struct ir3_register *reg)
623
{
624
   /* Half-regs can only take up the first half of the combined regfile */
625
   if (reg->flags & IR3_REG_HALF)
626
      return MIN2(file->size, RA_HALF_SIZE);
627
   else
628
      return file->size;
629
}
630

631
/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
632
 * top-level intervals at once. Pop multiple intervals, then push them back in
633
 * any order.
634
 */
635

636
struct ra_removed_interval {
637
   struct ra_interval *interval;
638
   unsigned size;
639
};
640

641
static struct ra_removed_interval
642
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
643
                struct ra_interval *interval)
644
{
645
   assert(!interval->interval.parent);
646

647
   /* Check if we've already moved this reg before */
648
   unsigned pcopy_index;
649
   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
650
        pcopy_index++) {
651
      if (ctx->parallel_copies[pcopy_index].interval == interval)
652
         break;
653
   }
654

655
   if (pcopy_index == ctx->parallel_copies_count) {
656
      array_insert(ctx, ctx->parallel_copies,
657
                   (struct ra_parallel_copy){
658
                      .interval = interval,
659
                      .src = interval->physreg_start,
660
                   });
661
   }
662

663
   ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);
664

665
   return (struct ra_removed_interval){
666
      .interval = interval,
667
      .size = interval->physreg_end - interval->physreg_start,
668
   };
669
}
670

671
static void
672
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
673
                 const struct ra_removed_interval *removed, physreg_t dst)
674
{
675
   struct ra_interval *interval = removed->interval;
676

677
   interval->physreg_start = dst;
678
   interval->physreg_end = dst + removed->size;
679

680
   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
681
}
682

683
/* Pick up the interval and place it at "dst". */
684
static void
685
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
686
                 struct ra_interval *interval, physreg_t dst)
687
{
688
   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
689
   ra_push_interval(ctx, file, &temp, dst);
690
}
691

692
static bool
693
get_reg_specified(struct ra_file *file, struct ir3_register *reg,
694
                  physreg_t physreg, bool is_source)
695
{
696
   for (unsigned i = 0; i < reg_size(reg); i++) {
697
      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
698
                       physreg + i))
699
         return false;
700
   }
701

702
   return true;
703
}
704

705
/* Try to evict any registers conflicting with the proposed spot "physreg" for
706
 * "reg". That is, move them to other places so that we can allocate "physreg"
707
 * here.
708
 */
709

710
static bool
711
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
712
               struct ir3_register *reg, physreg_t physreg,
713
               unsigned *_eviction_count, bool is_source, bool speculative)
714
{
715
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
716
   memcpy(available_to_evict, file->available_to_evict,
717
          sizeof(available_to_evict));
718

719
   for (unsigned i = 0; i < reg_size(reg); i++)
720
      BITSET_CLEAR(available_to_evict, physreg + i);
721

722
   unsigned eviction_count = 0;
723
   /* Iterate over each range conflicting with physreg */
724
   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
725
                           *next = ra_interval_next_or_null(conflicting);
726
        conflicting != NULL &&
727
        conflicting->physreg_start < physreg + reg_size(reg);
728
        conflicting = next, next = ra_interval_next_or_null(next)) {
729
      if (!is_source && conflicting->is_killed)
730
         continue;
731

732
      if (conflicting->frozen) {
733
         assert(speculative);
734
         return false;
735
      }
736

737
      unsigned avail_start, avail_end;
738
      bool evicted = false;
739
      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
740
                            reg_file_size(file, conflicting->interval.reg)) {
741
         unsigned size = avail_end - avail_start;
742

743
         /* non-half registers must be aligned */
744
         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
745
             avail_start % 2 == 1) {
746
            avail_start++;
747
            size--;
748
         }
749

750
         if (size >= conflicting->physreg_end - conflicting->physreg_start) {
751
            for (unsigned i = 0;
752
                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
753
               BITSET_CLEAR(available_to_evict, avail_start + i);
754
            eviction_count +=
755
               conflicting->physreg_end - conflicting->physreg_start;
756
            if (!speculative)
757
               ra_move_interval(ctx, file, conflicting, avail_start);
758
            evicted = true;
759
            break;
760
         }
761
      }
762

763
      if (!evicted)
764
         return false;
765
   }
766

767
   *_eviction_count = eviction_count;
768
   return true;
769
}
770

771
static int
772
removed_interval_cmp(const void *_i1, const void *_i2)
773
{
774
   const struct ra_removed_interval *i1 = _i1;
775
   const struct ra_removed_interval *i2 = _i2;
776

777
   /* We sort the registers as follows:
778
    *
779
    * |--------------------------------------------------------------------|
780
    * |                    |             |             |                   |
781
    * |  Half live-through | Half killed | Full killed | Full live-through |
782
    * |                    |             |             |                   |
783
    * |--------------------------------------------------------------------|
784
    *                        |                 |
785
    *                        |   Destination   |
786
    *                        |                 |
787
    *                        |-----------------|
788
    *
789
    * Half-registers have to be first so that they stay in the low half of
790
    * the register file. Then half and full killed must stay together so that
791
    * there's a contiguous range where we can put the register. With this
792
    * structure we should be able to accomodate any collection of intervals
793
    * such that the total number of half components is within the half limit
794
    * and the combined components are within the full limit.
795
    */
796

797
   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
798
   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
799
   if (i1_align > i2_align)
800
      return 1;
801
   if (i1_align < i2_align)
802
      return -1;
803

804
   if (i1_align == 1) {
805
      if (i2->interval->is_killed)
806
         return -1;
807
      if (i1->interval->is_killed)
808
         return 1;
809
   } else {
810
      if (i2->interval->is_killed)
811
         return 1;
812
      if (i1->interval->is_killed)
813
         return -1;
814
   }
815

816
   return 0;
817
}
818

819
/* "Compress" all the live intervals so that there is enough space for the
820
 * destination register. As there can be gaps when a more-aligned interval
821
 * follows a less-aligned interval, this also sorts them to remove such
822
 * "padding", which may be required when space is very tight.  This isn't
823
 * amazing, but should be used only as a last resort in case the register file
824
 * is almost full and badly fragmented.
825
 *
826
 * Return the physreg to use.
827
 */
828
static physreg_t
829
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
830
                   unsigned align, bool is_source)
831
{
832
   DECLARE_ARRAY(struct ra_removed_interval, intervals);
833
   intervals_count = intervals_sz = 0;
834
   intervals = NULL;
835

836
   unsigned removed_full_size = 0;
837
   unsigned removed_half_size = 0;
838
   unsigned file_size =
839
      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
840
   physreg_t start_reg = 0;
841

842
   foreach_interval_rev_safe (interval, file) {
843
      /* Check if we can sort the intervals *after* this one and have
844
       * enough space leftover to accomodate "size" units.
845
       */
846
      if (align == 1) {
847
         if (interval->physreg_end + removed_half_size <= file_size - size) {
848
            start_reg = interval->physreg_end;
849
            break;
850
         }
851
      } else {
852
         if (interval->physreg_end + removed_half_size <=
853
             file_size - removed_full_size - size) {
854
            start_reg = interval->physreg_end;
855
            break;
856
         }
857
      }
858

859
      /* We assume that all frozen intervals are at the start and that we
860
       * can avoid popping them.
861
       */
862
      assert(!interval->frozen);
863

864
      /* Killed sources don't count because they go at the end and can
865
       * overlap the register we're trying to add.
866
       */
867
      if (!interval->is_killed && !is_source) {
868
         if (interval->interval.reg->flags & IR3_REG_HALF)
869
            removed_half_size +=
870
               interval->physreg_end - interval->physreg_start;
871
         else
872
            removed_full_size +=
873
               interval->physreg_end - interval->physreg_start;
874
      }
875

876
      /* Now that we've done the accounting, pop this off */
877
      d("popping interval %u physreg %u\n", interval->interval.reg->name,
878
        interval->physreg_start);
879
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
880
   }
881

882
   /* TODO: In addition to skipping registers at the beginning that are
883
    * well-packed, we should try to skip registers at the end.
884
    */
885

886
   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
887

888
   physreg_t physreg = start_reg;
889
   physreg_t ret_reg = (physreg_t)~0;
890
   for (unsigned i = 0; i < intervals_count; i++) {
891
      if (ret_reg == (physreg_t)~0 &&
892
          ((intervals[i].interval->is_killed && !is_source) ||
893
           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
894
         ret_reg = ALIGN(physreg, align);
895
      }
896

897
      if (ret_reg != (physreg_t)~0 &&
898
          (is_source || !intervals[i].interval->is_killed)) {
899
         physreg = MAX2(physreg, ret_reg + size);
900
      }
901

902
      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
903
         physreg = ALIGN(physreg, 2);
904
      }
905

906
      if (physreg + intervals[i].size >
907
          reg_file_size(file, intervals[i].interval->interval.reg)) {
908
         d("ran out of room for interval %u!\n",
909
           intervals[i].interval->interval.reg->name);
910
         unreachable("reg pressure calculation was wrong!");
911
         return 0;
912
      }
913

914
      d("pushing interval %u physreg %u\n",
915
        intervals[i].interval->interval.reg->name, physreg);
916
      ra_push_interval(ctx, file, &intervals[i], physreg);
917

918
      physreg += intervals[i].size;
919
   }
920

921
   if (ret_reg == (physreg_t)~0)
922
      ret_reg = physreg;
923

924
   ret_reg = ALIGN(ret_reg, align);
925
   if (ret_reg + size > file_size) {
926
      d("ran out of room for the new interval!\n");
927
      unreachable("reg pressure calculation was wrong!");
928
      return 0;
929
   }
930

931
   return ret_reg;
932
}
933

934
static void
935
update_affinity(struct ir3_register *reg, physreg_t physreg)
936
{
937
   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
938
      return;
939

940
   if (physreg < reg->merge_set_offset)
941
      return;
942

943
   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
944
}
945

946
/* Try to find free space for a register without shuffling anything. This uses
947
 * a round-robin algorithm to reduce false dependencies.
948
 */
949
static physreg_t
950
find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
951
              unsigned align, bool is_source)
952
{
953
   BITSET_WORD *available =
954
      is_source ? file->available_to_evict : file->available;
955

956
   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
957
   unsigned candidate = start;
958
   do {
959
      bool is_available = true;
960
      for (unsigned i = 0; i < size; i++) {
961
         if (!BITSET_TEST(available, candidate + i)) {
962
            is_available = false;
963
            break;
964
         }
965
      }
966

967
      if (is_available) {
968
         file->start = (candidate + size) % file_size;
969
         return candidate;
970
      }
971

972
      candidate += align;
973
      if (candidate + size > file_size)
974
         candidate = 0;
975
   } while (candidate != start);
976

977
   return (physreg_t)~0;
978
}
979

980
static struct ra_file *
981
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
982
{
983
   if (reg->flags & IR3_REG_SHARED)
984
      return &ctx->shared;
985
   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
986
      return &ctx->full;
987
   else
988
      return &ctx->half;
989
}
990

991
/* This is the main entrypoint for picking a register. Pick a free register
992
 * for "reg", shuffling around sources if necessary. In the normal case where
993
 * "is_source" is false, this register can overlap with killed sources
994
 * (intervals with "is_killed == true"). If "is_source" is true, then
995
 * is_killed is ignored and the register returned must not overlap with killed
996
 * sources. This must be used for tied registers, because we're actually
997
 * allocating the destination and the tied source at the same time.
998
 */
999

1000
static physreg_t
1001
get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
1002
        bool is_source)
1003
{
1004
   unsigned file_size = reg_file_size(file, reg);
1005
   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1006
      physreg_t preferred_reg =
1007
         reg->merge_set->preferred_reg + reg->merge_set_offset;
1008
      if (preferred_reg < file_size &&
1009
          preferred_reg % reg_elem_size(reg) == 0 &&
1010
          get_reg_specified(file, reg, preferred_reg, is_source))
1011
         return preferred_reg;
1012
   }
1013

1014
   /* If this register is a subset of a merge set which we have not picked a
1015
    * register for, first try to allocate enough space for the entire merge
1016
    * set.
1017
    */
1018
   unsigned size = reg_size(reg);
1019
   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1020
       size < reg->merge_set->size) {
1021
      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
1022
                                         reg->merge_set->alignment, is_source);
1023
      if (best_reg != (physreg_t)~0u) {
1024
         best_reg += reg->merge_set_offset;
1025
         return best_reg;
1026
      }
1027
   }
1028

1029
   /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
1030
    * Because this doesn't introduce unnecessary dependencies, and it
1031
    * potentially avoids needing (ss) syncs for write after read hazards for
1032
    * SFU instructions:
1033
    */
1034
   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1035
      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1036
         struct ir3_register *src = reg->instr->srcs[i];
1037
         if (!ra_reg_is_src(src))
1038
            continue;
1039
         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1040
            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1041
            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1042
            if (src_physreg % reg_elem_size(reg) == 0 &&
1043
                src_physreg + size <= file_size &&
1044
                get_reg_specified(file, reg, src_physreg, is_source))
1045
               return src_physreg;
1046
         }
1047
      }
1048
   }
1049

1050
   physreg_t best_reg =
1051
      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
1052
   if (best_reg != (physreg_t)~0u) {
1053
      return best_reg;
1054
   }
1055

1056
   /* Ok, we couldn't find anything that fits. Here is where we have to start
1057
    * moving things around to make stuff fit. First try solely evicting
1058
    * registers in the way.
1059
    */
1060
   unsigned best_eviction_count = ~0;
1061
   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1062
      unsigned eviction_count;
1063
      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
1064
         if (eviction_count < best_eviction_count) {
1065
            best_eviction_count = eviction_count;
1066
            best_reg = i;
1067
         }
1068
      }
1069
   }
1070

1071
   if (best_eviction_count != ~0) {
1072
      ASSERTED bool result = try_evict_regs(
1073
         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
1074
      assert(result);
1075
      return best_reg;
1076
   }
1077

1078
   /* Use the dumb fallback only if try_evict_regs() fails. */
1079
   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
1080
                             is_source);
1081
}
1082

1083
static void
1084
assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1085
           unsigned num)
1086
{
1087
   if (reg->flags & IR3_REG_ARRAY) {
1088
      reg->array.base = num;
1089
      if (reg->flags & IR3_REG_RELATIV)
1090
         reg->array.offset += num;
1091
      else
1092
         reg->num = num + reg->array.offset;
1093
   } else {
1094
      reg->num = num;
1095
   }
1096
}
1097

1098
static void
1099
mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1100
{
1101
   struct ra_interval *interval = &ctx->intervals[src->def->name];
1102

1103
   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1104
       interval->interval.parent ||
1105
       !rb_tree_is_empty(&interval->interval.children))
1106
      return;
1107

1108
   ra_file_mark_killed(ra_get_file(ctx, src), interval);
1109
}
1110

1111
static void
1112
insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1113
{
1114
   struct ra_file *file = ra_get_file(ctx, dst);
1115
   struct ra_interval *interval = &ctx->intervals[dst->name];
1116

1117
   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1118

1119
   if (!(dst->flags & IR3_REG_UNUSED))
1120
      ra_file_insert(file, interval);
1121

1122
   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1123
}
1124

1125
static void
1126
allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1127
                   physreg_t physreg)
1128
{
1129
   struct ra_interval *interval = &ctx->intervals[dst->name];
1130
   update_affinity(dst, physreg);
1131

1132
   ra_interval_init(interval, dst);
1133
   interval->physreg_start = physreg;
1134
   interval->physreg_end = physreg + reg_size(dst);
1135
}
1136

1137
static void
1138
allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1139
{
1140
   struct ra_file *file = ra_get_file(ctx, dst);
1141

1142
   struct ir3_register *tied = dst->tied;
1143
   if (tied) {
1144
      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1145
      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1146
      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1147
      if (tied_interval->is_killed) {
1148
         /* The easy case: the source is killed, so we can just reuse it
1149
          * for the destination.
1150
          */
1151
         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1152
      } else {
1153
         /* The source is live-through, so we need to get a free register
1154
          * (which is free for both the source and destination!), copy the
1155
          * original source to it, then use that for the source and
1156
          * destination.
1157
          */
1158
         physreg_t physreg = get_reg(ctx, file, dst, true);
1159
         allocate_dst_fixed(ctx, dst, physreg);
1160
         array_insert(ctx, ctx->parallel_copies,
1161
                      (struct ra_parallel_copy){
1162
                         .interval = dst_interval,
1163
                         .src = tied_physreg,
1164
                      });
1165
      }
1166

1167
      return;
1168
   }
1169

1170
   /* All the hard work is done by get_reg here. */
1171
   physreg_t physreg = get_reg(ctx, file, dst, false);
1172

1173
   allocate_dst_fixed(ctx, dst, physreg);
1174
}
1175

1176
static void
1177
assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1178
           struct ir3_register *src)
1179
{
1180
   struct ra_interval *interval = &ctx->intervals[src->def->name];
1181
   struct ra_file *file = ra_get_file(ctx, src);
1182

1183
   struct ir3_register *tied = src->tied;
1184
   physreg_t physreg;
1185
   if (tied) {
1186
      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1187
      physreg = ra_interval_get_physreg(tied_interval);
1188
   } else {
1189
      physreg = ra_interval_get_physreg(interval);
1190
   }
1191

1192
   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1193

1194
   if (src->flags & IR3_REG_FIRST_KILL)
1195
      ra_file_remove(file, interval);
1196
}
1197

1198
/* Insert a parallel copy instruction before the instruction with the parallel
1199
 * copy entries we've built up.
1200
 */
1201
static void
1202
insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1203
{
1204
   if (ctx->parallel_copies_count == 0)
1205
      return;
1206

1207
   struct ir3_instruction *pcopy =
1208
      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1209
                       ctx->parallel_copies_count, ctx->parallel_copies_count);
1210

1211
   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1212
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1213
      struct ir3_register *reg =
1214
         ir3_dst_create(pcopy, INVALID_REG,
1215
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1216
      reg->size = entry->interval->interval.reg->size;
1217
      reg->wrmask = entry->interval->interval.reg->wrmask;
1218
      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1219
   }
1220

1221
   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1222
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1223
      struct ir3_register *reg =
1224
         ir3_src_create(pcopy, INVALID_REG,
1225
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1226
      reg->size = entry->interval->interval.reg->size;
1227
      reg->wrmask = entry->interval->interval.reg->wrmask;
1228
      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1229
   }
1230

1231
   list_del(&pcopy->node);
1232
   list_addtail(&pcopy->node, &instr->node);
1233
   ctx->parallel_copies_count = 0;
1234
}
1235

1236
static void
1237
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1238
{
1239
   /* First, mark sources as going-to-be-killed while allocating the dest. */
1240
   ra_foreach_src (src, instr) {
1241
      mark_src_killed(ctx, src);
1242
   }
1243

1244
   /* Allocate the destination. */
1245
   ra_foreach_dst (dst, instr) {
1246
      allocate_dst(ctx, dst);
1247
   }
1248

1249
   /* Now handle sources. Go backward so that in case there are multiple
1250
    * sources with the same def and that def is killed we only remove it at
1251
    * the end.
1252
    */
1253
   ra_foreach_src_rev (src, instr) {
1254
      assign_src(ctx, instr, src);
1255
   }
1256

1257
   /* Now finally insert the destination into the map. */
1258
   ra_foreach_dst (dst, instr) {
1259
      insert_dst(ctx, dst);
1260
   }
1261

1262
   insert_parallel_copy_instr(ctx, instr);
1263
}
1264

1265
static void
1266
handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1267
{
1268
   struct ir3_register *dst = instr->dsts[0];
1269
   struct ir3_register *src = instr->srcs[0];
1270

1271
   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1272
      handle_normal_instr(ctx, instr);
1273
      return;
1274
   }
1275

1276
   struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1277

1278
   physreg_t physreg = ra_interval_get_physreg(src_interval);
1279
   assign_src(ctx, instr, src);
1280

1281
   allocate_dst_fixed(
1282
      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1283
   insert_dst(ctx, dst);
1284
}
1285

1286
static void
1287
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1288
{
1289
   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1290
   unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1291

1292
   if (!dst_set || dst_set->regs_count == 1) {
1293
      handle_normal_instr(ctx, instr);
1294
      return;
1295
   }
1296

1297
   /* We need to check if any of the sources are contained in an interval
1298
    * that is at least as large as the vector. In this case, we should put
1299
    * the vector inside that larger interval. (There should be one
1300
    * unambiguous place to put it, because values sharing the same merge set
1301
    * should be allocated together.) This can happen in a case like:
1302
    *
1303
    * ssa_1 (wrmask=0xf) = ...
1304
    * ssa_2 = split ssa_1 off:0
1305
    * ssa_3 = split ssa_1 off:1
1306
    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1307
    * ... = (kill)ssa_1
1308
    * ... = (kill)ssa_4
1309
    *
1310
    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1311
    */
1312
   physreg_t dst_fixed = (physreg_t)~0u;
1313

1314
   for (unsigned i = 0; i < instr->srcs_count; i++) {
1315
      if (!ra_reg_is_src(instr->srcs[i]))
1316
         continue;
1317

1318
      if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) {
1319
         mark_src_killed(ctx, instr->srcs[i]);
1320
      }
1321

1322
      struct ir3_register *src = instr->srcs[i];
1323
      struct ra_interval *interval = &ctx->intervals[src->def->name];
1324

1325
      if (src->def->merge_set != dst_set || interval->is_killed)
1326
         continue;
1327
      while (interval->interval.parent != NULL) {
1328
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1329
      }
1330
      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1331
         dst_fixed = interval->physreg_start -
1332
                     interval->interval.reg->merge_set_offset + dst_offset;
1333
      } else {
1334
         /* For sources whose root interval is smaller than the
1335
          * destination (i.e. the normal case), we will shuffle them
1336
          * around after allocating the destination. Mark them killed so
1337
          * that the destination can be allocated over them, even if they
1338
          * aren't actually killed.
1339
          */
1340
         ra_file_mark_killed(ra_get_file(ctx, src), interval);
1341
      }
1342
   }
1343

1344
   if (dst_fixed != (physreg_t)~0u)
1345
      allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1346
   else
1347
      allocate_dst(ctx, instr->dsts[0]);
1348

1349
   /* Remove the temporary is_killed we added */
1350
   for (unsigned i = 0; i < instr->srcs_count; i++) {
1351
      if (!ra_reg_is_src(instr->srcs[i]))
1352
         continue;
1353

1354
      struct ir3_register *src = instr->srcs[i];
1355
      struct ra_interval *interval = &ctx->intervals[src->def->name];
1356
      while (interval->interval.parent != NULL) {
1357
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1358
      }
1359

1360
      /* Filter out cases where it actually should be killed */
1361
      if (interval != &ctx->intervals[src->def->name] ||
1362
          !(src->flags & IR3_REG_KILL))
1363
         interval->is_killed = false;
1364
   }
1365

1366
   ra_foreach_src_rev (src, instr) {
1367
      assign_src(ctx, instr, src);
1368
   }
1369

1370
   /* We need to do this before insert_dst(), so that children of the
1371
    * destination which got marked as killed and then shuffled around to make
1372
    * space for the destination have the correct pcopy destination that
1373
    * matches what we assign the source of the collect to in assign_src().
1374
    *
1375
    * TODO: In this case we'll wind up copying the value in the pcopy and
1376
    * then again in the collect. We could avoid one of those by updating the
1377
    * pcopy destination to match up with the final location of the source
1378
    * after the collect and making the collect a no-op. However this doesn't
1379
    * seem to happen often.
1380
    */
1381
   insert_parallel_copy_instr(ctx, instr);
1382

1383
   /* Note: insert_dst will automatically shuffle around any intervals that
1384
    * are a child of the collect by making them children of the collect.
1385
    */
1386

1387
   insert_dst(ctx, instr->dsts[0]);
1388
}
1389

1390
/* Parallel copies before RA should only be at the end of the block, for
1391
 * phi's. For these we only need to fill in the sources, and then we fill in
1392
 * the destinations in the successor block.
1393
 */
1394
static void
1395
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1396
{
1397
   ra_foreach_src_rev (src, instr) {
1398
      assign_src(ctx, instr, src);
1399
   }
1400
}
1401

1402
/* Some inputs may need to be precolored. We need to handle those first, so
1403
 * that other non-precolored inputs don't accidentally get allocated over
1404
 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1405
 * problem to allocate them to a specific physreg.
1406
 */
1407

1408
static void
1409
handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1410
{
1411
   if (instr->dsts[0]->num == INVALID_REG)
1412
      return;
1413

1414
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1415
   physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1416
   allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1417
   insert_dst(ctx, instr->dsts[0]);
1418
   interval->frozen = true;
1419
}
1420

1421
static void
1422
handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1423
{
1424
   if (instr->dsts[0]->num != INVALID_REG)
1425
      return;
1426

1427
   allocate_dst(ctx, instr->dsts[0]);
1428

1429
   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1430
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1431
   ra_file_insert(file, interval);
1432
}
1433

1434
static void
1435
assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1436
{
1437
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1438
   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1439

1440
   if (instr->dsts[0]->num == INVALID_REG) {
1441
      assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1442
   } else {
1443
      interval->frozen = false;
1444
   }
1445

1446
   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1447
      ra_file_remove(file, interval);
1448

1449
   ra_foreach_src_rev (src, instr)
1450
      assign_src(ctx, instr, src);
1451
}
1452

1453
/* chmask is a bit weird, because it has pre-colored sources due to the need
1454
 * to pass some registers to the next stage. Fortunately there are only at
1455
 * most two, and there should be no other live values by the time we get to
1456
 * this instruction, so we only have to do the minimum and don't need any
1457
 * fancy fallbacks.
1458
 *
1459
 * TODO: Add more complete handling of precolored sources, e.g. for function
1460
 * argument handling. We'd need a way to mark sources as fixed so that they
1461
 * don't get moved around when placing other sources in the fallback case, and
1462
 * a duplication of much of the logic in get_reg(). This also opens another
1463
 * can of worms, e.g. what if the precolored source is a split of a vector
1464
 * which is still live -- this breaks our assumption that splits don't incur
1465
 * any "extra" register requirements and we'd have to break it out of the
1466
 * parent ra_interval.
1467
 */
1468

1469
static void
1470
handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1471
{
1472
   struct ra_file *file = ra_get_file(ctx, src);
1473
   struct ra_interval *interval = &ctx->intervals[src->def->name];
1474
   physreg_t physreg = ra_reg_get_physreg(src);
1475

1476
   if (ra_interval_get_num(interval) == src->num)
1477
      return;
1478

1479
   /* Try evicting stuff in our way if it isn't free. This won't move
1480
    * anything unless it overlaps with our precolored physreg, so we don't
1481
    * have to worry about evicting other precolored sources.
1482
    */
1483
   if (!get_reg_specified(file, src, physreg, true)) {
1484
      unsigned eviction_count;
1485
      if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1486
                          false)) {
1487
         unreachable("failed to evict for precolored source!");
1488
         return;
1489
      }
1490
   }
1491

1492
   ra_move_interval(ctx, file, interval, physreg);
1493
}
1494

1495
static void
1496
handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1497
{
1498
   /* Note: we purposely don't mark sources as killed, so that we can reuse
1499
    * some of the get_reg() machinery as-if the source is a destination.
1500
    * Marking it as killed would make e.g. get_reg_specified() wouldn't work
1501
    * correctly.
1502
    */
1503
   ra_foreach_src (src, instr) {
1504
      assert(src->num != INVALID_REG);
1505
      handle_precolored_source(ctx, src);
1506
   }
1507

1508
   ra_foreach_src (src, instr) {
1509
      struct ra_file *file = ra_get_file(ctx, src);
1510
      struct ra_interval *interval = &ctx->intervals[src->def->name];
1511
      if (src->flags & IR3_REG_FIRST_KILL)
1512
         ra_file_remove(file, interval);
1513
   }
1514

1515
   insert_parallel_copy_instr(ctx, instr);
1516
}
1517

1518
static physreg_t
1519
read_register(struct ra_ctx *ctx, struct ir3_block *block,
1520
              struct ir3_register *def)
1521
{
1522
   struct ra_block_state *state = &ctx->blocks[block->index];
1523
   if (state->renames) {
1524
      struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1525
      if (entry) {
1526
         return (physreg_t)(uintptr_t)entry->data;
1527
      }
1528
   }
1529

1530
   return ra_reg_get_physreg(def);
1531
}
1532

1533
static void
1534
handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1535
{
1536
   physreg_t physreg = ~0;
1537
   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1538
      struct ir3_block *pred = ctx->block->predecessors[i];
1539
      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1540

1541
      if (!pred_state->visited ||
1542
          (pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED)))
1543
         continue;
1544

1545
      physreg = read_register(ctx, pred, def);
1546
      break;
1547
   }
1548

1549
   assert(physreg != (physreg_t)~0);
1550

1551
   struct ra_interval *interval = &ctx->intervals[def->name];
1552
   struct ra_file *file = ra_get_file(ctx, def);
1553
   ra_interval_init(interval, def);
1554
   interval->physreg_start = physreg;
1555
   interval->physreg_end = physreg + reg_size(def);
1556
   ra_file_insert(file, interval);
1557
}
1558

1559
static void
1560
handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1561
{
1562
   /* Skip parallelcopy's which in the original program are only used as phi
1563
    * arguments. Even though phi arguments are live out, they are only
1564
    * assigned when the phi is.
1565
    */
1566
   if (def->instr->opc == OPC_META_PARALLEL_COPY)
1567
      return;
1568

1569
   struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1570
   struct ra_interval *interval = &ctx->intervals[def->name];
1571
   physreg_t physreg = ra_interval_get_physreg(interval);
1572
   if (physreg != ra_reg_get_physreg(def)) {
1573
      if (!state->renames)
1574
         state->renames = _mesa_pointer_hash_table_create(ctx);
1575
      _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1576
   }
1577
}
1578

1579
static void
1580
handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1581
{
1582
   struct ra_file *file = ra_get_file(ctx, def);
1583
   struct ra_interval *interval = &ctx->intervals[def->name];
1584

1585
   /* phis are always scalar, so they should already be the smallest possible
1586
    * size. However they may be coalesced with other live-in values/phi
1587
    * nodes, so check for that here.
1588
    */
1589
   struct ir3_reg_interval *parent_ir3 =
1590
      ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1591
   physreg_t physreg;
1592
   if (parent_ir3) {
1593
      struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1594
      physreg = ra_interval_get_physreg(parent) +
1595
                (def->interval_start - parent_ir3->reg->interval_start);
1596
   } else {
1597
      physreg = get_reg(ctx, file, def, false);
1598
   }
1599

1600
   allocate_dst_fixed(ctx, def, physreg);
1601

1602
   ra_file_insert(file, interval);
1603
}
1604

1605
static void
1606
assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1607
{
1608
   struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1609
   struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
1610
   assert(!interval->interval.parent);
1611
   unsigned num = ra_interval_get_num(interval);
1612
   assign_reg(phi, phi->dsts[0], num);
1613

1614
   /* Assign the parallelcopy sources of this phi */
1615
   for (unsigned i = 0; i < phi->srcs_count; i++) {
1616
      if (phi->srcs[i]->def) {
1617
         assign_reg(phi, phi->srcs[i], num);
1618
         assign_reg(phi, phi->srcs[i]->def, num);
1619
      }
1620
   }
1621

1622
   if (phi->dsts[0]->flags & IR3_REG_UNUSED)
1623
      ra_file_remove(file, interval);
1624
}
1625

1626
/* When we split a live range, we sometimes need to emit fixup code at the end
1627
 * of a block. For example, something like:
1628
 *
1629
 * a = ...
1630
 * if (...) {
1631
 *    ...
1632
 *    a' = a
1633
 *    b = ... // a evicted to make room for b
1634
 *    ...
1635
 * }
1636
 * ... = a
1637
 *
1638
 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
1639
 * to insert another copy "a = a'" at the end of the if. Normally this would
1640
 * also entail adding a phi node, but since we're about to go out of SSA
1641
 * anyway we just insert an extra move. Note, however, that "b" might be used
1642
 * in a phi node at the end of the if and share registers with "a", so we
1643
 * have to be careful to extend any preexisting parallelcopy instruction
1644
 * instead of creating our own in order to guarantee that they properly get
1645
 * swapped.
1646
 */
1647

1648
static void
1649
insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
1650
                    struct ir3_register *reg)
1651
{
1652
   struct ir3_instruction *old_pcopy = NULL;
1653
   if (!list_is_empty(&block->instr_list)) {
1654
      struct ir3_instruction *last =
1655
         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
1656
      if (last->opc == OPC_META_PARALLEL_COPY)
1657
         old_pcopy = last;
1658
   }
1659

1660
   unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
1661
   struct ir3_instruction *pcopy = ir3_instr_create(
1662
      block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
1663

1664
   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1665
      old_pcopy->dsts[i]->instr = pcopy;
1666
      pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
1667
   }
1668

1669
   struct ir3_register *dst_reg =
1670
      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1671
   dst_reg->wrmask = reg->wrmask;
1672
   dst_reg->size = reg->size;
1673
   assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
1674

1675
   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1676
      pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
1677
   }
1678

1679
   struct ir3_register *src_reg =
1680
      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1681
   src_reg->wrmask = reg->wrmask;
1682
   src_reg->size = reg->size;
1683
   assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
1684

1685
   if (old_pcopy)
1686
      list_del(&old_pcopy->node);
1687
}
1688

1689
static void
1690
insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
1691
{
1692
   physreg_t physreg = ra_interval_get_physreg(interval);
1693

1694
   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
1695
   struct ir3_block **predecessors =
1696
      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
1697
   unsigned predecessors_count = shared
1698
                                    ? ctx->block->physical_predecessors_count
1699
                                    : ctx->block->predecessors_count;
1700

1701
   for (unsigned i = 0; i < predecessors_count; i++) {
1702
      struct ir3_block *pred = predecessors[i];
1703
      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1704

1705
      if (!pred_state->visited)
1706
         continue;
1707

1708
      physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
1709
      if (pred_reg != physreg) {
1710
         insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
1711

1712
         /* This is a bit tricky, but when visiting the destination of a
1713
          * physical-only edge, we have two predecessors (the if and the
1714
          * header block) and both have multiple successors. We pick the
1715
          * register for all live-ins from the normal edge, which should
1716
          * guarantee that there's no need for shuffling things around in
1717
          * the normal predecessor as long as there are no phi nodes, but
1718
          * we still may need to insert fixup code in the physical
1719
          * predecessor (i.e. the last block of the if) and that has
1720
          * another successor (the block after the if) so we need to update
1721
          * the renames state for when we process the other successor. This
1722
          * crucially depends on the other successor getting processed
1723
          * after this.
1724
          *
1725
          * For normal (non-physical) edges we disallow critical edges so
1726
          * that hacks like this aren't necessary.
1727
          */
1728
         if (!pred_state->renames)
1729
            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
1730
         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
1731
                                 (void *)(uintptr_t)physreg);
1732
      }
1733
   }
1734
}
1735

1736
static void
1737
insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
1738
{
1739
   BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
1740
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1741
                    physreg_node) {
1742
      /* Skip phi nodes. This needs to happen after phi nodes are allocated,
1743
       * because we may have to move live-ins around to make space for phi
1744
       * nodes, but we shouldn't be handling phi nodes here.
1745
       */
1746
      if (BITSET_TEST(live_in, interval->interval.reg->name))
1747
         insert_live_in_move(ctx, interval);
1748
   }
1749
}
1750

1751
static void
1752
insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
1753
{
1754
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1755
                    physreg_node) {
1756
      _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
1757
                              (void *)(uintptr_t)interval->physreg_start);
1758
   }
1759
}
1760

1761
static void
1762
insert_live_in_moves(struct ra_ctx *ctx)
1763
{
1764
   insert_file_live_in_moves(ctx, &ctx->full);
1765
   insert_file_live_in_moves(ctx, &ctx->half);
1766
   insert_file_live_in_moves(ctx, &ctx->shared);
1767

1768
   /* If not all predecessors are visited, insert live-in regs so that
1769
    * insert_live_out_moves() will work.
1770
    */
1771
   bool all_preds_visited = true;
1772
   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1773
      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
1774
         all_preds_visited = false;
1775
         break;
1776
      }
1777
   }
1778

1779
   if (!all_preds_visited) {
1780
      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1781
      state->entry_regs = _mesa_pointer_hash_table_create(ctx);
1782

1783
      insert_entry_regs(state, &ctx->full);
1784
      insert_entry_regs(state, &ctx->half);
1785
      insert_entry_regs(state, &ctx->shared);
1786
   }
1787
}
1788

1789
static void
1790
insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
1791
{
1792
   for (unsigned i = 0; i < 2; i++) {
1793
      if (!ctx->block->successors[i])
1794
         continue;
1795

1796
      struct ir3_block *succ = ctx->block->successors[i];
1797
      struct ra_block_state *succ_state = &ctx->blocks[succ->index];
1798

1799
      if (!succ_state->visited)
1800
         continue;
1801

1802
      struct hash_entry *entry = _mesa_hash_table_search(
1803
         succ_state->entry_regs, interval->interval.reg);
1804
      if (!entry)
1805
         continue;
1806

1807
      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
1808
      if (new_reg != interval->physreg_start) {
1809
         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
1810
                             interval->interval.reg);
1811
      }
1812
   }
1813
}
1814

1815
static void
1816
insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
1817
{
1818
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1819
                    physreg_node) {
1820
      insert_live_out_move(ctx, interval);
1821
   }
1822
}
1823

1824
static void
1825
insert_live_out_moves(struct ra_ctx *ctx)
1826
{
1827
   insert_file_live_out_moves(ctx, &ctx->full);
1828
   insert_file_live_out_moves(ctx, &ctx->half);
1829
   insert_file_live_out_moves(ctx, &ctx->shared);
1830
}
1831

1832
static void
1833
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1834
{
1835
   ctx->block = block;
1836

1837
   /* Reset the register files from the last block */
1838
   ra_file_init(&ctx->full);
1839
   ra_file_init(&ctx->half);
1840
   ra_file_init(&ctx->shared);
1841

1842
   bool unreachable = false;
1843
   if (block != ir3_start_block(ctx->ir)) {
1844
      unreachable = true;
1845
      for (unsigned i = 0; i < block->predecessors_count; i++) {
1846
         struct ra_block_state *pred_state =
1847
            &ctx->blocks[block->predecessors[i]->index];
1848
         if (!pred_state->logical_unreachable) {
1849
            unreachable = false;
1850
            break;
1851
         }
1852
      }
1853
   }
1854

1855
   ctx->blocks[block->index].logical_unreachable = unreachable;
1856

1857
   /* Handle live-ins, phis, and input meta-instructions. These all appear
1858
    * live at the beginning of the block, and interfere with each other
1859
    * therefore need to be allocated "in parallel". This means that we
1860
    * have to allocate all of them, inserting them into the file, and then
1861
    * delay updating the IR until all of them are allocated.
1862
    *
1863
    * Handle precolored inputs first, because we need to make sure that other
1864
    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
1865
    * and inputs at the same time, because the first block doesn't have
1866
    * predecessors. Therefore handle_live_in doesn't have to worry about
1867
    * them.
1868
    */
1869

1870
   foreach_instr (instr, &block->instr_list) {
1871
      if (instr->opc == OPC_META_INPUT)
1872
         handle_precolored_input(ctx, instr);
1873
      else
1874
         break;
1875
   }
1876

1877
   unsigned name;
1878
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1879
                       ctx->live->definitions_count) {
1880
      struct ir3_register *reg = ctx->live->definitions[name];
1881
      if (unreachable && !(reg->flags & IR3_REG_SHARED))
1882
         continue;
1883
      handle_live_in(ctx, reg);
1884
   }
1885

1886
   foreach_instr (instr, &block->instr_list) {
1887
      if (instr->opc == OPC_META_PHI)
1888
         handle_phi(ctx, instr->dsts[0]);
1889
      else if (instr->opc == OPC_META_INPUT ||
1890
               instr->opc == OPC_META_TEX_PREFETCH)
1891
         handle_input(ctx, instr);
1892
      else
1893
         break;
1894
   }
1895

1896
   /* After this point, every live-in/phi/input has an interval assigned to
1897
    * it. We delay actually assigning values until everything has been
1898
    * allocated, so we can simply ignore any parallel copy entries created
1899
    * when shuffling them around.
1900
    */
1901
   ctx->parallel_copies_count = 0;
1902

1903
   insert_live_in_moves(ctx);
1904

1905
   if (RA_DEBUG) {
1906
      printf("after live-in block %u:\n", block->index);
1907
      ra_ctx_dump(ctx);
1908
   }
1909

1910
   /* Now we're done with processing live-ins, and can handle the body of the
1911
    * block.
1912
    */
1913
   foreach_instr (instr, &block->instr_list) {
1914
      if (RA_DEBUG) {
1915
         printf("processing: ");
1916
         ir3_print_instr(instr);
1917
      }
1918

1919
      if (instr->opc == OPC_META_PHI)
1920
         assign_phi(ctx, instr);
1921
      else if (instr->opc == OPC_META_INPUT ||
1922
               instr->opc == OPC_META_TEX_PREFETCH)
1923
         assign_input(ctx, instr);
1924
      else if (instr->opc == OPC_META_SPLIT)
1925
         handle_split(ctx, instr);
1926
      else if (instr->opc == OPC_META_COLLECT)
1927
         handle_collect(ctx, instr);
1928
      else if (instr->opc == OPC_META_PARALLEL_COPY)
1929
         handle_pcopy(ctx, instr);
1930
      else if (instr->opc == OPC_CHMASK)
1931
         handle_chmask(ctx, instr);
1932
      else
1933
         handle_normal_instr(ctx, instr);
1934

1935
      if (RA_DEBUG)
1936
         ra_ctx_dump(ctx);
1937
   }
1938

1939
   insert_live_out_moves(ctx);
1940

1941
   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
1942
                       ctx->live->definitions_count) {
1943
      struct ir3_register *reg = ctx->live->definitions[name];
1944
      handle_live_out(ctx, reg);
1945
   }
1946

1947
   ctx->blocks[block->index].visited = true;
1948
}
1949

1950
static unsigned
1951
calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
1952
{
1953
   /* Registers are allocated in units of vec4, so switch from units of
1954
    * half-regs to vec4.
1955
    */
1956
   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
1957

1958
   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
1959

1960
   unsigned target = reg_count;
1961
   unsigned reg_independent_max_waves =
1962
      ir3_get_reg_independent_max_waves(v, double_threadsize);
1963
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
1964
      v->shader->compiler, reg_count, double_threadsize);
1965
   unsigned target_waves =
1966
      MIN2(reg_independent_max_waves, reg_dependent_max_waves);
1967

1968
   while (target <= RA_FULL_SIZE / (2 * 4) &&
1969
          ir3_should_double_threadsize(v, target) == double_threadsize &&
1970
          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
1971
                                          double_threadsize) >= target_waves)
1972
      target++;
1973

1974
   return (target - 1) * 2 * 4;
1975
}
1976

1977
int
1978
ir3_ra(struct ir3_shader_variant *v)
1979
{
1980
   ir3_calc_dominance(v->ir);
1981

1982
   ir3_create_parallel_copies(v->ir);
1983

1984
   struct ir3_liveness *live = ir3_calc_liveness(v);
1985

1986
   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
1987

1988
   ir3_merge_regs(live, v->ir);
1989

1990
   struct ir3_pressure max_pressure;
1991
   ir3_calc_pressure(v, live, &max_pressure);
1992
   d("max pressure:");
1993
   d("\tfull: %u", max_pressure.full);
1994
   d("\thalf: %u", max_pressure.half);
1995
   d("\tshared: %u", max_pressure.shared);
1996

1997
   if (v->mergedregs) {
1998
      max_pressure.full += max_pressure.half;
1999
      max_pressure.half = 0;
2000
   }
2001

2002
   if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE ||
2003
       max_pressure.shared > RA_SHARED_SIZE) {
2004
      d("max pressure exceeded!");
2005
      return 1;
2006
   }
2007

2008
   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2009

2010
   ctx->ir = v->ir;
2011
   ctx->merged_regs = v->mergedregs;
2012
   ctx->compiler = v->shader->compiler;
2013
   ctx->stage = v->type;
2014
   ctx->live = live;
2015
   ctx->intervals =
2016
      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2017
   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2018

2019
   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2020
   d("full size: %u", ctx->full.size);
2021

2022
   if (!v->mergedregs)
2023
      ctx->half.size = RA_HALF_SIZE;
2024

2025
   ctx->shared.size = RA_SHARED_SIZE;
2026

2027
   foreach_block (block, &v->ir->block_list)
2028
      handle_block(ctx, block);
2029

2030
   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2031

2032
   /* Strip array-ness and SSA-ness at the end, because various helpers still
2033
    * need to work even on definitions that have already been assigned. For
2034
    * example, we need to preserve array-ness so that array live-ins have the
2035
    * right size.
2036
    */
2037
   foreach_block (block, &v->ir->block_list) {
2038
      foreach_instr (instr, &block->instr_list) {
2039
         for (unsigned i = 0; i < instr->dsts_count; i++) {
2040
            instr->dsts[i]->flags &= ~IR3_REG_SSA;
2041

2042
            /* Parallel copies of array registers copy the whole register,
2043
             * and we need some way to let the parallel copy code know
2044
             * that this was an array whose size is determined by
2045
             * reg->size. So keep the array flag on those.
2046
             */
2047
            if (!is_meta(instr))
2048
               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2049
         }
2050

2051
         for (unsigned i = 0; i < instr->srcs_count; i++) {
2052
            instr->srcs[i]->flags &= ~IR3_REG_SSA;
2053

2054
            if (!is_meta(instr))
2055
               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2056
         }
2057
      }
2058
   }
2059

2060
   ir3_debug_print(v->ir, "AFTER: register allocation");
2061

2062
   ir3_lower_copies(v);
2063

2064
   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2065

2066
   ralloc_free(ctx);
2067
   ralloc_free(live);
2068
   return 0;
2069
}
2070

2071
Product

Resources

Company