Path: blob/21.2-virgl/src/freedreno/ir3/ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) \
   do { \
      if (SCHED_DEBUG) { \
         printf("SCHED: " fmt "\n", ##__VA_ARGS__); \
      } \
   } while (0)

#define di(instr, fmt, ...) \
   do { \
      if (SCHED_DEBUG) { \
         printf("SCHED: " fmt ": ", ##__VA_ARGS__); \
         ir3_print_instr(instr); \
      } \
   } while (0)

/*
 * Instruction Scheduling:
 *
 * A block-level pre-RA scheduler, which works by creating a DAG of
 * instruction dependencies, and heuristically picking a DAG head
 * (instruction with no unscheduled dependencies).
 *
 * Where possible, it tries to pick instructions that avoid nop delay
 * slots, but it will prefer to pick instructions that reduce (or do
 * not increase) the number of live values.
 *
 * If the only possible choices are instructions that increase the
 * number of live values, it will try to pick the one with the earliest
 * consumer (based on pre-sched program order).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 *
 * TODO we can detect too-large live_values here.. would be a good place
 * to "spill" cheap things, like move from uniform/immed.  (Constructing
 * a list of ssa def consumers before the sched pass would make this
 * easier.)  Also, in general it might be best not to re-use load_immed
 * across blocks.
 *
 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
 * the # of immediates in play (or at least that would help with
 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*)..  probably
 * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
 * these into src modifiers..
 */
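
/* Illustrative example (not taken from a real shader) of the a0.x
 * serialization described above: given two producer/consumer pairs
 *
 *    (1) write a0.x   ->   (2) indirect access via a0.x
 *    (3) write a0.x   ->   (4) indirect access via a0.x
 *
 * the scheduler must emit (1) and (2) before (3), since scheduling (3)
 * while (2) is still unscheduled would clobber the a0.x value that (2)
 * depends on.  The same applies to a1.x and p0.x.
 */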

struct ir3_sched_ctx {
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */
   struct ir3_instruction *scheduled; /* last scheduled instr */
   struct ir3_instruction *addr0;     /* current a0.x user, if any */
   struct ir3_instruction *addr1;     /* current a1.x user, if any */
   struct ir3_instruction *pred;      /* current p0.x user, if any */

   struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */

   int remaining_kills;
   int remaining_tex;

   bool error;

   int sfu_delay;
   int tex_delay;

   /* We order the scheduled tex/SFU instructions, and keep track of the
    * index of the last waited on instruction, so we can know which
    * instructions are still outstanding (and therefore would require us to
    * wait for all outstanding instructions before scheduling a use).
    */
   int tex_index, first_outstanding_tex_index;
   int sfu_index, first_outstanding_sfu_index;
};

struct ir3_sched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;

   unsigned delay;
   unsigned max_delay;

   unsigned tex_index;
   unsigned sfu_index;

   /* For instructions that are a meta:collect src, once we schedule
    * the first src of the collect, the entire vecN is live (at least
    * from the PoV of the first RA pass.. the 2nd scalar pass can fill
    * in some of the gaps, but often not all).  So we want to help out
    * RA, and realize that as soon as we schedule the first collect
    * src, there is no penalty to schedule the remainder (ie. they
    * don't make additional values live).  In fact we'd prefer to
    * schedule the rest ASAP to minimize the live range of the vecN.
    *
    * For instructions that are the src of a collect, we track the
    * corresponding collect, and mark them as partially live as soon
    * as any one of the src's is scheduled.
    */
   struct ir3_instruction *collect;
   bool partially_live;

   /* Is this instruction a direct or indirect dependency for a kill?
    * If so, we should prioritize it when possible
    */
   bool kill_path;

   /* This node represents a shader output.  A semi-common pattern in
    * shaders is something along the lines of:
    *
    *    fragcolor.w = 1.0
    *
    * Which we'd prefer to schedule as late as possible, since it
    * produces a live value that is never killed/consumed.  So detect
    * outputs up-front, and avoid scheduling them unless they reduce
    * register pressure (or at least are neutral)
    */
   bool output;
};
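
/* Worked example for the collect/partially_live tracking above (hypothetical
 * vec4 collect with srcs s0..s3): scheduling s0 makes the whole vec4 live
 * from the first RA pass's point of view, so schedule() marks s1..s3 as
 * partially_live, and live_effect() then treats scheduling them as free,
 * which encourages emitting the rest of the vector right away.
 */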

#define foreach_sched_node(__n, __list) \
   list_for_each_entry (struct ir3_sched_node, __n, __list, dag.link)

static void sched_node_init(struct ir3_sched_ctx *ctx,
                            struct ir3_instruction *instr);
static void sched_node_add_dep(struct ir3_instruction *instr,
                               struct ir3_instruction *src, int i);

static bool
is_scheduled(struct ir3_instruction *instr)
{
   return !!(instr->flags & IR3_INSTR_MARK);
}

/* check_src_cond() passing an ir3_sched_ctx. */
static bool
sched_check_src_cond(struct ir3_instruction *instr,
                     bool (*cond)(struct ir3_instruction *,
                                  struct ir3_sched_ctx *),
                     struct ir3_sched_ctx *ctx)
{
   foreach_ssa_src (src, instr) {
      /* meta:split/collect aren't real instructions, the thing that
       * we actually care about is *their* srcs
       */
      if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
         if (sched_check_src_cond(src, cond, ctx))
            return true;
      } else {
         if (cond(src, ctx))
            return true;
      }
   }

   return false;
}

/* Is this a prefetch or tex that hasn't been waited on yet? */

static bool
is_outstanding_tex_or_prefetch(struct ir3_instruction *instr,
                               struct ir3_sched_ctx *ctx)
{
   if (!is_tex_or_prefetch(instr))
      return false;

   /* The sched node is only valid within the same block, we cannot
    * really say anything about src's from other blocks
    */
   if (instr->block != ctx->block)
      return true;

   struct ir3_sched_node *n = instr->data;
   return n->tex_index >= ctx->first_outstanding_tex_index;
}

static bool
is_outstanding_sfu(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
{
   if (!is_sfu(instr))
      return false;

   /* The sched node is only valid within the same block, we cannot
    * really say anything about src's from other blocks
    */
   if (instr->block != ctx->block)
      return true;

   struct ir3_sched_node *n = instr->data;
   return n->sfu_index >= ctx->first_outstanding_sfu_index;
}

static unsigned
cycle_count(struct ir3_instruction *instr)
{
   if (instr->opc == OPC_META_COLLECT) {
      /* Assume that only immed/const sources produce moves */
      unsigned n = 0;
      foreach_src (src, instr) {
         if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
            n++;
      }
      return n;
   } else if (is_meta(instr)) {
      return 0;
   } else {
      return 1;
   }
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from depth list:
    */
   list_delinit(&instr->node);

   if (writes_addr0(instr)) {
      debug_assert(ctx->addr0 == NULL);
      ctx->addr0 = instr;
   }

   if (writes_addr1(instr)) {
      debug_assert(ctx->addr1 == NULL);
      ctx->addr1 = instr;
   }

   if (writes_pred(instr)) {
      debug_assert(ctx->pred == NULL);
      ctx->pred = instr;
   }

   instr->flags |= IR3_INSTR_MARK;

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);
   ctx->scheduled = instr;

   if (is_kill_or_demote(instr)) {
      assert(ctx->remaining_kills > 0);
      ctx->remaining_kills--;
   }

   struct ir3_sched_node *n = instr->data;

   /* If this instruction is a meta:collect src, mark the remaining
    * collect srcs as partially live.
    */
   if (n->collect) {
      foreach_ssa_src (src, n->collect) {
         if (src->block != instr->block)
            continue;
         struct ir3_sched_node *sn = src->data;
         sn->partially_live = true;
      }
   }

   dag_prune_head(ctx->dag, &n->dag);

   unsigned cycles = cycle_count(instr);

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
      n->sfu_index = ctx->sfu_index++;
   } else if (!is_meta(instr) &&
              sched_check_src_cond(instr, is_outstanding_sfu, ctx)) {
      ctx->sfu_delay = 0;
      ctx->first_outstanding_sfu_index = ctx->sfu_index;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay);
   }

   if (is_tex_or_prefetch(instr)) {
      /* NOTE that this isn't an attempt to hide texture fetch latency,
       * but an attempt to hide the cost of switching to another warp.
       * If we can, we'd like to try to schedule another texture fetch
       * before scheduling something that would sync.
       */
      ctx->tex_delay = 10;
      assert(ctx->remaining_tex > 0);
      ctx->remaining_tex--;
      n->tex_index = ctx->tex_index++;
   } else if (!is_meta(instr) &&
              sched_check_src_cond(instr, is_outstanding_tex_or_prefetch,
                                   ctx)) {
      ctx->tex_delay = 0;
      ctx->first_outstanding_tex_index = ctx->tex_index;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay -= MIN2(cycles, ctx->tex_delay);
   }
}
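
/* Note on the bookkeeping above: sfu_delay/tex_delay are rough counters of
 * how many estimated cycles have passed since the last SFU or tex/prefetch
 * was scheduled (starting at 8 and 10 respectively), while tex_index/
 * sfu_index vs. first_outstanding_{tex,sfu}_index track how many of those
 * instructions have not yet had a consumer scheduled (scheduling a consumer
 * forces an (ss)/(sy) sync and resets the window).  should_defer() below
 * uses both pieces of state to decide whether to postpone a would-be
 * consumer.
 */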

struct ir3_sched_notes {
   /* there is at least one kill which could be scheduled, except
    * for unscheduled bary.f's:
    */
   bool blocked_kill;
   /* there is at least one instruction that could be scheduled,
    * except for conflicting address/predicate register usage:
    */
   bool addr0_conflict, addr1_conflict, pred_conflict;
};

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
   foreach_ssa_src (other_src, instr) {
      /* if dependency not scheduled, we aren't ready yet: */
      if ((src != other_src) && !is_scheduled(other_src)) {
         return false;
      }
   }
   return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
            struct ir3_instruction *instr)
{
   debug_assert(!is_scheduled(instr));

   if (instr == ctx->split) {
      /* Don't schedule instructions created by splitting an a0.x/a1.x/p0.x
       * write until another "normal" instruction has been scheduled.
       */
      return false;
   }

   if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
      /* avoid texture/memory access if we have unscheduled kills
       * that could make the expensive operation unnecessary.  By
       * definition, if there are remaining kills, and this instr
       * is not a dependency of a kill, there are other instructions
       * that we can choose from.
       */
      struct ir3_sched_node *n = instr->data;
      if (!n->kill_path)
         return false;
   }

   /* For instructions that write address register we need to
    * make sure there is at least one instruction that uses the
    * addr value which is otherwise ready.
    *
    * NOTE if any instructions use pred register and have other
    * src args, we would need to do the same for writes_pred()..
    */
   if (writes_addr0(instr)) {
      struct ir3 *ir = instr->block->shader;
      bool ready = false;
      for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
         struct ir3_instruction *indirect = ir->a0_users[i];
         if (!indirect)
            continue;
         if (indirect->address->def != instr->dsts[0])
            continue;
         ready = could_sched(indirect, instr);
      }

      /* nothing could be scheduled, so keep looking: */
      if (!ready)
         return false;
   }

   if (writes_addr1(instr)) {
      struct ir3 *ir = instr->block->shader;
      bool ready = false;
      for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
         struct ir3_instruction *indirect = ir->a1_users[i];
         if (!indirect)
            continue;
         if (indirect->address->def != instr->dsts[0])
            continue;
         ready = could_sched(indirect, instr);
      }

      /* nothing could be scheduled, so keep looking: */
      if (!ready)
         return false;
   }

   /* if this is a write to address/predicate register, and that
    * register is currently in use, we need to defer until it is
    * free:
    */
   if (writes_addr0(instr) && ctx->addr0) {
      debug_assert(ctx->addr0 != instr);
      notes->addr0_conflict = true;
      return false;
   }

   if (writes_addr1(instr) && ctx->addr1) {
      debug_assert(ctx->addr1 != instr);
      notes->addr1_conflict = true;
      return false;
   }

   if (writes_pred(instr) && ctx->pred) {
      debug_assert(ctx->pred != instr);
      notes->pred_conflict = true;
      return false;
   }

   /* if the instruction is a kill, we need to ensure *every*
    * bary.f is scheduled.  The hw seems unhappy if the thread
    * gets killed before the end-input (ei) flag is hit.
    *
    * We could do this by adding each bary.f instruction as
    * virtual ssa src for the kill instruction.  But we have
    * fixed length instr->srcs[].
    *
    * TODO we could handle this by false-deps now, probably.
    */
   if (is_kill_or_demote(instr)) {
      struct ir3 *ir = instr->block->shader;

      for (unsigned i = 0; i < ir->baryfs_count; i++) {
         struct ir3_instruction *baryf = ir->baryfs[i];
         if (baryf->flags & IR3_INSTR_UNUSED)
            continue;
         if (!is_scheduled(baryf)) {
            notes->blocked_kill = true;
            return false;
         }
      }
   }

   return true;
}

/* Find the instr->ip of the closest use of an instruction, in
 * pre-sched order.  This isn't going to be the same as post-sched
 * order, but it is a reasonable approximation to limit scheduling
 * instructions *too* early.  This is mostly to prevent bad behavior
 * in cases where we have a large number of possible instructions
 * to choose, to avoid creating too much parallelism (ie. blowing
 * up register pressure)
 *
 * See
 * dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
 */
static int
nearest_use(struct ir3_instruction *instr)
{
   unsigned nearest = ~0;
   foreach_ssa_use (use, instr)
      if (!is_scheduled(use))
         nearest = MIN2(nearest, use->ip);

   /* slight hack.. this heuristic tends to push bary.f's to later
    * in the shader, closer to their uses.  But we actually would
    * prefer to get these scheduled earlier, to unlock varying
    * storage for more VS jobs:
    */
   if (is_input(instr))
      nearest /= 2;

   return nearest;
}

static bool
is_only_nonscheduled_use(struct ir3_instruction *instr,
                         struct ir3_instruction *use)
{
   foreach_ssa_use (other_use, instr) {
      if (other_use != use && !is_scheduled(other_use))
         return false;
   }

   return true;
}

/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
   struct ir3_sched_node *n = instr->data;
   int new_live =
      (n->partially_live || !instr->uses || instr->uses->entries == 0)
         ? 0
         : dest_regs(instr);
   int freed_live = 0;

   /* if we schedule something that causes a vecN to be live,
    * then count all its other components too:
    */
   if (n->collect)
      new_live *= n->collect->srcs_count;

   foreach_ssa_src_n (src, n, instr) {
      if (__is_false_dep(instr, n))
         continue;

      if (instr->block != src->block)
         continue;

      if (is_only_nonscheduled_use(src, instr))
         freed_live += dest_regs(src);
   }

   return new_live - freed_live;
}
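
/* Example: a scalar ALU op whose single dest still has uses and whose two
 * srcs have no other unscheduled uses gives live_effect() = 1 - 2 = -1, so
 * scheduling it is a net win for register pressure.  An instruction whose
 * result feeds a collect is charged for the whole vecN (new_live multiplied
 * by the collect's src count) unless that vector is already partially live.
 */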

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU/tex, the less costly a sync would be, and the number of
 * outstanding SFU/tex instructions to prevent a blowup in register pressure.
 */
static bool
should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (sched_check_src_cond(instr, is_outstanding_sfu, ctx))
         return true;
   }

   /* We mostly just want to try to schedule another texture fetch
    * before scheduling something that would (sy) sync, so we can
    * limit this rule to cases where there are remaining texture
    * fetches
    */
   if (ctx->tex_delay && ctx->remaining_tex) {
      if (sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx))
         return true;
   }

   /* Avoid scheduling too many outstanding texture or sfu instructions at
    * once by deferring further tex/SFU instructions.  This both prevents
    * stalls when the queue of texture/sfu instructions becomes too large,
    * and prevents unacceptably large increases in register pressure from too
    * many outstanding texture instructions.
    */
   if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr))
      return true;

   if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr))
      return true;

   return false;
}

static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
                                               struct ir3_sched_notes *notes,
                                               bool defer, bool avoid_output);
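
/* The overall selection order in choose_instr_dec()/choose_instr_inc() is
 * roughly:
 *
 *   1. freed + ready    (reduces pressure, needs no nop delay slots)
 *   2. freed            (reduces pressure)
 *   3. neutral + ready
 *   4. neutral
 *   5. otherwise fall through to choose_instr_inc()
 *
 * with ties within each step broken in favor of the larger max_delay
 * (ie. the longer critical path).
 */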

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 *
 * Only handles the case of choosing instructions that reduce register pressure
 * or are even.
 */
static struct ir3_sched_node *
choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                 bool defer)
{
   const char *mode = defer ? "-d" : "";
   struct ir3_sched_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (defer && should_defer(ctx, n->instr))
         continue;

      /* Note: mergedregs is only used post-RA, just set it to false */
      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);

      if (d > 0)
         continue;

      if (live_effect(n->instr) > -1)
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      if (!chosen || chosen->max_delay < n->max_delay) {
         chosen = n;
      }
   }

   if (chosen) {
      di(chosen->instr, "dec%s: chose (freed+ready)", mode);
      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (defer && should_defer(ctx, n->instr))
         continue;

      if (live_effect(n->instr) > -1)
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      if (!chosen || chosen->max_delay < n->max_delay) {
         chosen = n;
      }
   }

   if (chosen) {
      di(chosen->instr, "dec%s: chose (freed)", mode);
      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block the freeing of that value.  This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (defer && should_defer(ctx, n->instr))
         continue;

      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);

      if (d > 0)
         continue;

      if (live_effect(n->instr) > 0)
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
      return chosen;
   }

   foreach_sched_node (n, &ctx->dag->heads) {
      if (defer && should_defer(ctx, n->instr))
         continue;

      if (live_effect(n->instr) > 0)
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "dec%s: chose (neutral)", mode);
      return chosen;
   }

   return choose_instr_inc(ctx, notes, defer, true);
}

/**
 * When we can't choose an instruction that reduces register pressure or
 * is neutral, we end up here to try and pick the least bad option.
 */
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                 bool defer, bool avoid_output)
{
   const char *mode = defer ? "-d" : "";
   struct ir3_sched_node *chosen = NULL;

   /*
    * From here on out, we are picking something that increases
    * register pressure.  So try to pick something which will
    * be consumed soon:
    */
   unsigned chosen_distance = 0;

   /* Among the ready set, pick the instruction whose result is consumed
    * soonest (smallest nearest_use() distance).
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (avoid_output && n->output)
         continue;

      if (defer && should_defer(ctx, n->instr))
         continue;

      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);

      if (d > 0)
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      unsigned distance = nearest_use(n->instr);

      if (!chosen || distance < chosen_distance) {
         chosen = n;
         chosen_distance = distance;
      }
   }

   if (chosen) {
      di(chosen->instr, "inc%s: chose (distance+ready)", mode);
      return chosen;
   }

   /* Same again, but among all remaining leaders (ignoring delay-slot
    * readiness).
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (avoid_output && n->output)
         continue;

      if (defer && should_defer(ctx, n->instr))
         continue;

      if (!check_instr(ctx, notes, n->instr))
         continue;

      unsigned distance = nearest_use(n->instr);

      if (!chosen || distance < chosen_distance) {
         chosen = n;
         chosen_distance = distance;
      }
   }

   if (chosen) {
      di(chosen->instr, "inc%s: chose (distance)", mode);
      return chosen;
   }

   return NULL;
}
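
/* Note: on the first attempt (called from choose_instr_dec() with
 * avoid_output=true), nodes marked as pure shader outputs are skipped, so a
 * value like "fragcolor.w = 1.0" is only emitted once nothing better is
 * available; choose_instr() retries with avoid_output=false as the last
 * resort.
 */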

/* Handles instruction selections for instructions we want to prioritize
 * even if csp/csr would not pick them.
 */
static struct ir3_sched_node *
choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
   struct ir3_sched_node *chosen = NULL;

   foreach_sched_node (n, &ctx->dag->heads) {
      /*
       * - phi nodes and inputs must be scheduled first
       * - split should be scheduled first, so that the vector value is
       *   killed as soon as possible.  RA cannot split up the vector and
       *   reuse components that have been killed until the whole vector
       *   has been killed.
       * - collect, on the other hand, should be treated as a "normal"
       *   instruction, and may add to register pressure if its sources are
       *   part of another vector or immediates.
       */
      if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen;
   }

   return NULL;
}

static void
dump_state(struct ir3_sched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay,
         live_effect(n->instr), ir3_delay_calc_prera(ctx->block, n->instr));

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
   struct ir3_sched_node *chosen;

   dump_state(ctx);

   chosen = choose_instr_prio(ctx, notes);
   if (chosen)
      return chosen->instr;

   chosen = choose_instr_dec(ctx, notes, true);
   if (chosen)
      return chosen->instr;

   chosen = choose_instr_dec(ctx, notes, false);
   if (chosen)
      return chosen->instr;

   chosen = choose_instr_inc(ctx, notes, false, false);
   if (chosen)
      return chosen->instr;

   return NULL;
}

static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
   struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
   di(new_instr, "split instruction");
   sched_node_init(ctx, new_instr);
   return new_instr;
}
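
/* For example, if a0.x was written once for two indirect accesses and only
 * the first one got scheduled before a conflicting a0.x write became the
 * only option, split_addr() below clones the original a0.x write and
 * repoints the remaining unscheduled user(s) at the clone, so the
 * conflicting writer can eventually be scheduled.
 */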

/* "spill" the address registers by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
           struct ir3_instruction **users, unsigned users_count)
{
   struct ir3_instruction *new_addr = NULL;
   unsigned i;

   debug_assert(*addr);

   for (i = 0; i < users_count; i++) {
      struct ir3_instruction *indirect = users[i];

      if (!indirect)
         continue;

      /* skip instructions already scheduled: */
      if (is_scheduled(indirect))
         continue;

      /* remap remaining instructions using current addr
       * to new addr:
       */
      if (indirect->address->def == (*addr)->dsts[0]) {
         if (!new_addr) {
            new_addr = split_instr(ctx, *addr);
            /* original addr is scheduled, but new one isn't: */
            new_addr->flags &= ~IR3_INSTR_MARK;
         }
         indirect->address->def = new_addr->dsts[0];
         /* don't need to remove old dag edge since old addr is
          * already scheduled:
          */
         sched_node_add_dep(indirect, new_addr, 0);
         di(indirect, "new address");
      }
   }

   /* all remaining indirects remapped to new addr: */
   *addr = NULL;

   return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
   struct ir3 *ir;
   struct ir3_instruction *new_pred = NULL;
   unsigned i;

   debug_assert(ctx->pred);

   ir = ctx->pred->block->shader;

   for (i = 0; i < ir->predicates_count; i++) {
      struct ir3_instruction *predicated = ir->predicates[i];

      if (!predicated)
         continue;

      /* skip instructions already scheduled: */
      if (is_scheduled(predicated))
         continue;

      /* remap remaining instructions using current pred
       * to new pred:
       *
       * TODO is there ever a case when pred isn't first
       * (and only) src?
       */
      if (ssa(predicated->srcs[0]) == ctx->pred) {
         if (!new_pred) {
            new_pred = split_instr(ctx, ctx->pred);
            /* original pred is scheduled, but new one isn't: */
            new_pred->flags &= ~IR3_INSTR_MARK;
         }
         predicated->srcs[0]->instr = new_pred;
         /* don't need to remove old dag edge since old pred is
          * already scheduled:
          */
         sched_node_add_dep(predicated, new_pred, 0);
         di(predicated, "new predicate");
      }
   }

   if (ctx->block->condition == ctx->pred) {
      if (!new_pred) {
         new_pred = split_instr(ctx, ctx->pred);
         /* original pred is scheduled, but new one isn't: */
         new_pred->flags &= ~IR3_INSTR_MARK;
      }
      ctx->block->condition = new_pred;
      d("new branch condition");
   }

   /* all remaining predicated remapped to new pred: */
   ctx->pred = NULL;

   return new_pred;
}

static void
sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
                   int i)
{
   /* don't consider dependencies in other blocks: */
   if (src->block != instr->block)
      return;

   /* we could have false-dep's that end up unused: */
   if (src->flags & IR3_INSTR_UNUSED) {
      debug_assert(__is_false_dep(instr, i));
      return;
   }

   struct ir3_sched_node *n = instr->data;
   struct ir3_sched_node *sn = src->data;

   /* If src is consumed by a collect, track that to realize that once
    * any of the collect srcs are live, we should hurry up and schedule
    * the rest.
    */
   if (instr->opc == OPC_META_COLLECT)
      sn->collect = instr;

   dag_add_edge(&sn->dag, &n->dag, NULL);

   unsigned d = ir3_delayslots(src, instr, i, true);

   n->delay = MAX2(n->delay, d);
}
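
/* n->delay above ends up being the largest number of delay slots that would
 * be needed between any of this instruction's srcs and the instruction
 * itself (per ir3_delayslots()), ie. the nop cost of scheduling it
 * immediately after the src that produced its operand.
 */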

static void
mark_kill_path(struct ir3_instruction *instr)
{
   struct ir3_sched_node *n = instr->data;

   if (n->kill_path) {
      return;
   }

   n->kill_path = true;

   foreach_ssa_src (src, instr) {
      if (src->block != instr->block)
         continue;
      mark_kill_path(src);
   }
}

/* Is this a collect whose only uses are shader outputs (end/chmask)? */
static bool
is_output_collect(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_META_COLLECT)
      return false;

   foreach_ssa_use (use, instr) {
      if (use->opc != OPC_END && use->opc != OPC_CHMASK)
         return false;
   }

   return true;
}

/* Is its only use as a shader output? */
static bool
is_output_only(struct ir3_instruction *instr)
{
   if (!writes_gpr(instr))
      return false;

   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
      return false;

   foreach_ssa_use (use, instr)
      if (!is_output_collect(use))
         return false;

   return true;
}

static void
sched_node_add_deps(struct ir3_instruction *instr)
{
   /* There's nothing to do for phi nodes, since they always go first.  And
    * phi nodes can reference sources later in the same block, so handling
    * sources is not only unnecessary but could cause problems.
    */
   if (instr->opc == OPC_META_PHI)
      return;

   /* Since foreach_ssa_src() already handles false-dep's we can construct
    * the DAG easily in a single pass.
    */
   foreach_ssa_src_n (src, i, instr) {
      sched_node_add_dep(instr, src, i);
   }

   /* NOTE that all inputs must be scheduled before a kill, so
    * mark these to be prioritized as well:
    */
   if (is_kill_or_demote(instr) || is_input(instr)) {
      mark_kill_path(instr);
   }

   if (is_output_only(instr)) {
      struct ir3_sched_node *n = instr->data;
      n->output = true;
   }
}
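
/* Propagate delays bottom-up: a node's max_delay becomes its own delay plus
 * the largest max_delay among its consumers, ie. a rough measure of the
 * delay-weighted path from this instruction to the end of the block.  The
 * choose_instr_* heuristics prefer larger max_delay when breaking ties, so
 * long dependency chains get started early.
 */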

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_sched_node *n = (struct ir3_sched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_sched_ctx *ctx)
{
   ctx->dag = dag_create(ctx);

   foreach_instr (instr, &ctx->unscheduled_list) {
      sched_node_init(ctx, instr);
      sched_node_add_deps(instr);
   }

   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_sched_ctx *ctx)
{
   ralloc_free(ctx->dag);
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;

   /* addr/pred writes are per-block: */
   ctx->addr0 = NULL;
   ctx->addr1 = NULL;
   ctx->pred = NULL;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;
   ctx->tex_index = ctx->first_outstanding_tex_index = 0;
   ctx->sfu_index = ctx->first_outstanding_sfu_index = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   sched_dag_init(ctx);

   ctx->remaining_kills = 0;
   ctx->remaining_tex = 0;
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      if (is_kill_or_demote(instr))
         ctx->remaining_kills++;
      if (is_tex_or_prefetch(instr))
         ctx->remaining_tex++;
   }

   /* First schedule all meta:input and meta:phi instructions, followed by
    * tex-prefetch.  We want all of the instructions that load values into
    * registers before the shader starts to go before any other instructions.
    * But in particular we want inputs to come before prefetches.  This is
    * because a FS's bary_ij input may not actually be live in the shader,
    * but it should not be scheduled on top of any other input (but can be
    * overwritten by a tex prefetch)
    *
    * Note: Because the first block cannot have predecessors, meta:input and
    * meta:phi cannot exist in the same block.
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_sched_notes notes = {0};
      struct ir3_instruction *instr;

      instr = choose_instr(ctx, &notes);
      if (instr) {
         unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
         d("delay=%u", delay);

         /* and if we run out of instructions that can be scheduled,
          * then it is time for nop's:
          */
         debug_assert(delay <= 6);
         while (delay > 0) {
            ir3_NOP(block);
            delay--;
         }

         schedule(ctx, instr);

         /* Since we've scheduled a "real" instruction, we can now
          * schedule any split instruction created by the scheduler again.
          */
         ctx->split = NULL;
      } else {
         struct ir3_instruction *new_instr = NULL;
         struct ir3 *ir = block->shader;

         /* nothing available to schedule.. if we are blocked on
          * address/predicate register conflict, then break the
          * deadlock by cloning the instruction that wrote that
          * reg:
          */
         if (notes.addr0_conflict) {
            new_instr =
               split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
         } else if (notes.addr1_conflict) {
            new_instr =
               split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count);
         } else if (notes.pred_conflict) {
            new_instr = split_pred(ctx);
         } else {
            d("unscheduled_list:");
            foreach_instr (instr, &ctx->unscheduled_list)
               di(instr, "unscheduled: ");
            debug_assert(0);
            ctx->error = true;
            return;
         }

         if (new_instr) {
            list_delinit(&new_instr->node);
            list_addtail(&new_instr->node, &ctx->unscheduled_list);
         }

         /* If we produced a new instruction, do not schedule it next, to
          * guarantee progress.
          */
         ctx->split = new_instr;
      }
   }

   sched_dag_destroy(ctx);
}
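
/* Top-level entry point: ir3_count_instructions() numbers the instructions
 * (giving the pre-sched instr->ip that nearest_use() relies on),
 * ir3_clear_mark() resets the IR3_INSTR_MARK "scheduled" flag, and
 * ir3_find_ssa_uses() builds the use lists consumed by live_effect() and
 * is_only_nonscheduled_use().  Each block is then scheduled independently.
 */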

int
ir3_sched(struct ir3 *ir)
{
   struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr->data = NULL;
      }
   }

   ir3_count_instructions(ir);
   ir3_clear_mark(ir);
   ir3_find_ssa_uses(ir, ctx, false);

   foreach_block (block, &ir->block_list) {
      sched_block(ctx, block);
   }

   int ret = ctx->error ? -1 : 0;

   ralloc_free(ctx);

   return ret;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
   /* The expectation is that there is only a single array
    * src or dst, ir3_cp should enforce this.
    */

   foreach_dst (dst, instr)
      if (dst->flags & IR3_REG_ARRAY)
         return dst->array.id;
   foreach_src (src, instr)
      if (src->flags & IR3_REG_ARRAY)
         return src->array.id;

   unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
   /* TODO for dependencies that are related to a specific object, ie
    * a specific SSBO/image/array, we could relax this constraint to
    * make accesses to unrelated objects not depend on each other (at
    * least as long as not declared coherent)
    */
   if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) &&
        prior->barrier_class) ||
       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) &&
        instr->barrier_class))
      return true;

   if (instr->barrier_class & prior->barrier_conflict) {
      if (!(instr->barrier_class &
            ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
         /* if only array barrier, then we can further limit false-deps
          * by considering the array-id, ie reads/writes to different
          * arrays do not depend on each other (no aliasing)
          */
         if (get_array_id(instr) != get_array_id(prior)) {
            return false;
         }
      }

      return true;
   }

   return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct list_head *prev = instr->node.prev;
   struct list_head *next = instr->node.next;

   /* add dependencies on previous instructions that must be scheduled
    * prior to the current instruction
    */
   while (prev != &block->instr_list) {
      struct ir3_instruction *pi =
         LIST_ENTRY(struct ir3_instruction, prev, node);

      prev = prev->prev;

      if (is_meta(pi))
         continue;

      if (instr->barrier_class == pi->barrier_class) {
         ir3_instr_add_dep(instr, pi);
         break;
      }

      if (depends_on(instr, pi))
         ir3_instr_add_dep(instr, pi);
   }

   /* add dependencies on this instruction to following instructions
    * that must be scheduled after the current instruction:
    */
   while (next != &block->instr_list) {
      struct ir3_instruction *ni =
         LIST_ENTRY(struct ir3_instruction, next, node);

      next = next->next;

      if (is_meta(ni))
         continue;

      if (instr->barrier_class == ni->barrier_class) {
         ir3_instr_add_dep(ni, instr);
         break;
      }

      if (depends_on(ni, instr))
         ir3_instr_add_dep(ni, instr);
   }
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
bool
ir3_sched_add_deps(struct ir3 *ir)
{
   bool progress = false;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->barrier_class) {
            add_barrier_deps(block, instr);
            progress = true;
         }
      }
   }

   return progress;
}
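
/* Usage sketch (assumed; the exact pass ordering lives in the compiler
 * driver, not in this file):
 *
 *    IR3_PASS(ir, ir3_sched_add_deps);   // add barrier false-deps first
 *    ...
 *    ret = ir3_sched(ir);                // then run the pre-RA scheduler
 */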