Path: blob/21.2-virgl/src/freedreno/ir3/ir3_merge_regs.c

/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: That is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement that
 * value-comparison optimization they explain, as without it we wouldn't be
 * able to coalesce META_SPLIT even in the simplest of cases.
 */
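
/* As a small illustration (hypothetical IR, not actual pass output): a phi
 * such as
 *
 *    ssa_3 = phi(ssa_1, ssa_2)
 *
 * first gets a parallelcopy inserted at the end of each predecessor block:
 *
 *    pred0: ssa_4 = parallelcopy(ssa_1)
 *    pred1: ssa_5 = parallelcopy(ssa_2)
 *    succ:  ssa_3 = phi(ssa_4, ssa_5)
 *
 * and then ssa_3, ssa_4, and ssa_5 are placed in one merge set, so that RA
 * can give all three the same register and the copies become trivial.
 */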

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
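
/* A sketch of the value-chasing above, with hypothetical IR: given
 *
 *    ssa_2 = collect(ssa_0, ssa_1)
 *    ssa_3 = split(ssa_2, off=0)
 *
 * chase_copies() on ssa_3 steps through the split into the collect and ends
 * at (ssa_0, offset 0). Chasing ssa_0 itself trivially yields the same
 * def_value, so even if ssa_0 and ssa_3 are live at the same time they are
 * known to hold the same bits and interference checking can be skipped.
 */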

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only 1 or
    * 2 so far this should be ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
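
/* Worked example for the size/offset bookkeeping above (values assumed, not
 * taken from a real shader): merging a set b of size 2 into a set a of size
 * 4 at b_offset == 2 leaves a->size == MAX2(4, 2 + 2) == 4, and each
 * register coming from b gets its merge_set_offset bumped by 2, keeping all
 * offsets relative to the start of the combined set. With b_offset == 4 the
 * combined size would instead grow to MAX2(4, 2 + 4) == 6.
 */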

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: Even in this case we may not always successfully be able to
       * coalesce this copy, if the offsets don't line up. But in any
       * case, we can't do anything.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}
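
/* For instance (hypothetical IR, full-precision registers so each element is
 * 2 half-regs wide): for
 *
 *    ssa_4 = collect(ssa_1, ssa_2, ssa_3)
 *
 * we try to merge ssa_1 at offset 0, ssa_2 at offset 2, and ssa_3 at offset
 * 4 of ssa_4's merge set, so that RA can allocate the sources directly into
 * the contiguous destination and the collect needs no copies.
 */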

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = reg_elems(src[j]);
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }
}

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   printf("merge sets:\n");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            printf("merge set, size %u, align %u:\n", merge_set->size,
                   merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n",
                      reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}
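
/* With IR3_DBG_RAMSGS the dump above produces output along these lines (the
 * numbers here are made up for illustration):
 *
 *    merge sets:
 *    merge set, size 6, align 2:
 *            ssa_12:0, offset 0
 *            ssa_9:0, offset 2
 *            ssa_10:0, offset 4
 */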

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}
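
/* Expected call order (a sketch; the real call sites live in the RA code,
 * and the ir3_calc_liveness() entry point is assumed here): parallel copies
 * must exist before liveness is computed, since they introduce new defs that
 * liveness has to cover, and merging consumes that liveness information:
 *
 *    ir3_create_parallel_copies(ir);
 *    struct ir3_liveness *live = ir3_calc_liveness(...);
 *    ir3_merge_regs(live, ir);
 */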