Path: blob/21.2-virgl/src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         printf("PSCHED: " fmt "\n", ##__VA_ARGS__);                           \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         printf("PSCHED: " fmt ": ", ##__VA_ARGS__);                           \
         ir3_print_instr(instr);                                               \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   int sfu_delay;
   int tex_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   bool has_tex_src, has_sfu_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_tex_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_tex_src;
}

static bool
has_sfu_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sfu_src;
}

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (has_sfu_src(instr)) {
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (has_tex_src(instr)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}
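/* Added note (editorial, not from the original source): the sfu_delay /
 * tex_delay counters updated above are a cheap proxy for how much of a
 * producer's latency has already elapsed.  A hypothetical sequence:
 *
 *    rcp r0.x, r1.x      ; SFU result       -> sfu_delay = 8
 *    add r2.x, ...       ; unrelated        -> sfu_delay = 7
 *    mul r2.y, ...       ; unrelated        -> sfu_delay = 6
 *    (ss)mad ..., r0.x   ; consumes SFU     -> sfu_delay = 0
 *
 * The more unrelated instructions land between producer and consumer,
 * the cheaper the eventual (ss) sync becomes; once a consumer has been
 * scheduled the counter resets, since the sync has already been paid.
 */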
static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay /
 * tex_delay counters, ie. the more cycles it has been since the last SFU
 * or tex fetch, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (has_sfu_src(instr))
         return true;
   }

   if (ctx->tex_delay) {
      if (has_tex_src(instr))
         return true;
   }

   return false;
}
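/* Added summary (editorial): choose_instr() below scans the DAG heads
 * repeatedly, in decreasing priority order: meta instructions, inputs,
 * kills, expensive sfu/tex instructions, then progressively weaker
 * "ready" criteria (soft-ready without forcing a sync, soft-ready,
 * hard-ready), finally falling back to whichever leader has the largest
 * max_delay, ie. the longest remaining critical path.
 */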
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU..  ie. if the last SFU was just one or two instructions ago,
    * and we could choose between taking a nop and then scheduling
    * something else, vs scheduling the immediately available instruction
    * that would require (ss), we are better off with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
                                            ctx->v->mergedregs);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ sync
    * bit, but we probably do want to schedule some other instructions
    * while we wait):
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls..  but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}
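/* Added summary (editorial): the dependency table below is walked twice
 * by calculate_deps() -- once forward and once in reverse.  As I read
 * it, the forward pass adds read-after-write and write-after-write
 * edges, while the reverse pass (same code, opposite iteration order)
 * catches write-after-read hazards, so every pair of conflicting
 * register accesses ends up ordered in the DAG.
 */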
struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block)
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we consider each full precision dep
    * as two half-precision dependencies, vs the older separate (non-
    * conflicting) files, in which case the first half of the table is
    * used for full precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   if (src_n >= 0 && dep && state->direction == F) {
      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
      node->delay = MAX2(node->delay, d);
      if (is_tex_or_prefetch(dep->instr))
         node->has_tex_src = true;
      if (is_sfu(dep->instr))
         node->has_sfu_src = true;
   }

   add_dep(state, dep, node);
   if (src_n < 0) {
      dep_reg(state, num) = node;
   }
}
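/* Added illustration (editorial, hypothetical register numbers): in
 * merged mode (a6xx+) a full register num occupies table slots 2*num
 * and 2*num+1, while a half register num maps to slot num directly.
 * So a write to full r0.y (num=1) marks slots 2 and 3, and therefore
 * conflicts with reads of half hr0.z (num=2) and hr0.w (num=3),
 * matching the hw aliasing of the merged register file.  In the older
 * non-merged case the two files cannot conflict, so half regs simply
 * live in the second half of the table.
 */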
/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating
 * the delay (it corresponds to node->instr->regs[src_n]).  If src_n is
 * negative, then this is for a destination register.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i);
         }
      }
   }

   /* And then afterwards update the state for what this instruction
    * wrote:
    */
   foreach_dst (reg, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned i = 0; i < reg->size; i++) {
            add_reg_dep(state, node, reg, reg->array.base + i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
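/* Added note (editorial): sched_dag_max_delay_cb above is run bottom-up
 * over the DAG (see sched_dag_init() below), so max_delay accumulates
 * into an estimate of the critical-path length from each node to the
 * end of the block.  choose_instr() uses it as the main tiebreak,
 * scheduling the longest remaining chain first.
 */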
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge(&in->dag, &n->dag, NULL);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge(&kn->dag, &n->dag, NULL);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}
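/* Added overview (editorial): sched_block() below is the per-block
 * driver: move everything onto unscheduled_list, drop pre-existing
 * nop's and branches, build the DAG, schedule inputs and then tex
 * prefetches up front, and finally loop choosing the best ready
 * instruction, padding with nop's whenever the chosen instruction
 * still has outstanding delay cycles.
 */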
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay =
         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   ir3_remove_nops(ir);
   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}