/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP scheduling algorithm (by Connor Abbott <[email protected]>)
 *
 * The GP pipeline has three main stages:
 *
 *   -------------------------------------------------------
 *   |                                                     |
 *   |              Register/Attr/Temp Fetch               |
 *   |                                                     |
 *   -------------------------------------------------------
 *   |        |        |        |        |        |        |
 *   |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass  |
 *   |        |        |        |        |        |        |
 *   -------------------------------------------------------
 *   |              |                       |              |
 *   |   Complex1   | Temp/Register/Varying |     Pass     |
 *   |   Stage 2    |         Store         |   Stage 2    |
 *   |              |                       |              |
 *   -------------------------------------------------------
 *
 * Because of this setup, storing a register has a latency of three cycles.
 * Also, the register file is organized into 4-component vectors, and the
 * load stage can only load two vectors at a time. Aside from these highly
 * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can access the results of
 * any unit from the previous two cycles directly, except for the complex
 * unit whose result can only be accessed for one cycle (since it's expected
 * to be used directly by the complex2 instruction in the following cycle).
 *
 * Because of the very restricted register file, and because only rarely are
 * all the units in use at the same time, it can be very beneficial to use
 * the unused units to "thread" a value from source to destination by using
 * moves in the otherwise-unused units, without involving the register file
 * at all. It's very difficult to fully exploit this with a traditional
 * scheduler, so we need to do something a little un-traditional. The 512
 * instruction limit means that for more complex shaders, we need to do as
 * well as possible or else the app won't even work.
 *
 * The scheduler works by considering the bypass network as a kind of
 * register file. It's a quite unusual register file, since registers have to
 * be assigned "on the fly" as we schedule operations, but with some care, we
 * can use something conceptually similar to a linear-scan allocator to
 * successfully schedule nodes to instructions without running into
 * conflicts.
 *
 * Values in the IR are separated into normal values, or "value registers",
 * which is what normal nodes like add, mul, etc. produce, and which only
 * live inside one basic block, and registers, which can span multiple basic
 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
 * assigns physical registers to both value registers and normal registers,
 * treating load_reg/store_reg as a move instruction, but these are only used
 * directly for normal registers -- the physreg assigned to a value register
 * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert write-after-read dependencies, even for value registers, as if
 * we're going to use those, but then we throw them away. For example, if we
 * had something like:
 *
 *    (*)r2 = add (*)r1, (*)r2
 *    (*)r1 = load_reg r0
 *
 * we'd insert a write-after-read dependency between the add and load_reg,
 * even though the starred registers aren't actually used by the scheduler
 * after this step. This step is crucial since it guarantees that during any
 * point in the schedule, the number of live registers + live value registers
 * will never exceed the capacity of the register file and the bypass network
 * combined. This is because each live register/value register will have a
 * different fake number, thanks to the fake dependencies inserted before
 * scheduling. This allows us to not have to worry about spilling to
 * temporaries, which is only done ahead of time.
 *
 * The scheduler is a bottom-up scheduler. It keeps track of each live value
 * register, and decides on-the-fly which value registers to keep in the
 * bypass network and which to "spill" to registers. Of particular importance
 * is the "ready list," which consists of "input nodes" (nodes that produce a
 * value that can be consumed via the bypass network), both "partially ready"
 * (only some of the uses have been scheduled) and "fully ready" (all uses
 * have been scheduled), as well as other non-input nodes like register
 * stores. Each input node on the ready list represents a live value register
 * before the current instruction. There must be at most 11 such input nodes
 * at all times, since there are only 11 slots in the next two instructions
 * which can reach the current instruction.
 *
 * An input node is a "max node" if it has a use two cycles ago, which must be
 * connected to a definition this cycle. Otherwise it may be a "next max node"
 * if it will be a max node on the next instruction (i.e. it has a use at most
 * one cycle ago), or it may be neither if all of its uses are this cycle. As
 * we keep adding instructions to the front, input nodes graduate from
 * neither, to next max, to max, unless we decide to insert a move to keep it
 * alive longer, at which point any uses after the current instruction are
 * rewritten to be uses of the move so that the original node returns to
 * neither. The scheduler decides which nodes to try freely, but we have to
 * reserve slots for two different reasons: (1) out of the 5 non-complex
 * slots, we reserve a slot for each max node, so that we can connect a
 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
 * slot for every next-max node above 5, so that for the next instruction
 * there are no more than 5 max nodes. When a max or next-max node gets
 * scheduled, the corresponding reservation is reduced by one. At the end, we
 * insert moves for every slot that was reserved. The reservation is actually
 * managed by gpir_instr, and all we have to do is tell it how many to reserve
 * at the beginning and then tell it which nodes are max/next-max nodes. When
 * we start scheduling an instruction, there will be at most 5 max nodes
 * thanks to the previous instruction's next-max reservation/move insertion.
 * Since there are at most 11 total input nodes, if there are N max nodes,
 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
 * 6 - N slots need to be reserved for next-max nodes, and so at most
 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
 * of slots. So, thanks to the total input node restriction, we will never
 * need to reserve too many slots.
 *
 * It sometimes happens that scheduling a given node will violate this total
 * input node restriction, or that a reservation will mean that we can't
 * schedule it. We first schedule a node "speculatively" to see if this is a
 * problem. If some of the node's sources are loads, then we can schedule
 * the node and its dependent loads in one swoop to avoid going over the
 * pressure limit. If that fails, we can try to spill a ready or
 * partially-ready input node to a register by rewriting all of its uses to
 * refer to a register load. This removes it from the list of ready and
 * partially ready input nodes as all of its uses are now unscheduled. If
 * successful, we can then proceed with scheduling the original node. All of
 * this happens "speculatively," meaning that afterwards the node is removed
 * and the entire state of the scheduler is reverted to before it was tried, to
 * ensure that we never get into an invalid state and run out of spots for
 * moves. In try_node(), we try to schedule each node speculatively on the
 * ready list, keeping only the nodes that could be successfully scheduled, so
 * that when we finally decide which node to actually schedule, we know it
 * will succeed. This is how we decide on the fly which values go in
 * registers and which go in the bypass network. Note that "unspilling" a
 * value is simply a matter of scheduling the store_reg instruction created
 * when we spill.
 *
 * The careful accounting of live value registers, reservations for moves, and
 * speculative scheduling guarantee that we never run into a failure case
 * while scheduling. However, we need to make sure that this scheduler will
 * not get stuck in an infinite loop, i.e. that we'll always make forward
 * progress by eventually scheduling a non-move node. If we run out of value
 * registers, then we may have to spill a node to a register. If we
 * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
 * value registers before the current instruction. But since there are at most
 * 64+11 live registers and register values total thanks to the fake
 * dependencies we inserted before scheduling, there are at most 64 - N live
 * physical registers, and therefore there are at least N registers available
 * for spilling. Not all these registers will be available immediately, since
 * in order to spill a node to a given register we have to ensure that there
 * are slots available to rewrite every use to a load instruction, and that
 * may not be the case. There may also be intervening writes which prevent
 * some registers from being used. However, these are all temporary problems,
 * since as we create each instruction, we create additional register load
 * slots that can be freely used for spilling, and we create more move nodes
 * which means that the uses of the nodes we're trying to spill keep moving
 * forward. This means that eventually, these problems will go away, at which
 * point we'll be able to spill a node successfully, so eventually we'll be
 * able to schedule the first node on the ready list.
 */
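
/* A minimal worked illustration of the reservation arithmetic described
 * above. This is not part of the scheduler and the helper name below is
 * hypothetical; the real accounting lives in gpir_instr. With N max nodes
 * and M next-max nodes, N + M <= 11, so the reserved slot count can never
 * exceed 6, the number of ALU slots in an instruction.
 */
#if 0
static int example_reserved_slots(int max_nodes, int next_max_nodes)
{
   /* one of the 5 non-complex slots per max node */
   int reserved = max_nodes;

   /* one of the 6 slots for every next-max node above 5 */
   if (next_max_nodes > 5)
      reserved += next_max_nodes - 5;

   /* e.g. max_nodes = 3, next_max_nodes = 8: 3 + 3 = 6 reserved slots */
   return reserved;
}
#endif
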
typedef struct {
   /* This is the list of ready and partially-ready nodes. A partially-ready
    * node must have at least one input dependency already scheduled.
    */
   struct list_head ready_list;

   /* The number of ready or partially-ready nodes with at least one input
    * dependency already scheduled. In other words, the number of live value
    * registers. This must be at most 11.
    */
   int ready_list_slots;

   /* The physical registers live into the current instruction. */
   uint64_t live_physregs;

   /* The current instruction. */
   gpir_instr *instr;

   /* The current basic block. */
   gpir_block *block;

   /* True if at least one node failed to schedule due to lack of available
    * value registers.
    */
   bool try_spill_all;

   /* The number of max nodes that need to be spilled to successfully
    * schedule the instruction.
    */
   int max_node_spill_needed;

   /* The number of max and next-max nodes that need to be spilled to
    * successfully schedule the instruction.
    */
   int total_spill_needed;

   /* For each physical register, a linked list of loads associated with it
    * in this block. When we spill a value to a given register and there are
    * existing loads associated with it that haven't been scheduled yet, we
    * have to make sure that the corresponding unspill happens after the last
    * original use has happened, i.e. is scheduled before it.
    */
   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
} sched_ctx;
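
/* Illustrative sanity check, not part of the driver (the helper name is
 * hypothetical): the fake dependencies inserted before scheduling guarantee
 * that the live value registers plus the live physical registers never
 * exceed the bypass network plus the register file, as argued in the big
 * comment above.
 */
#if 0
static void example_check_pressure(const sched_ctx *ctx)
{
   assert(ctx->ready_list_slots + util_bitcount64(ctx->live_physregs) <=
          GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
}
#endif
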
static int gpir_min_dist_alu(gpir_dep *dep)
229
{
230
switch (dep->pred->op) {
231
case gpir_op_load_uniform:
232
case gpir_op_load_temp:
233
case gpir_op_load_reg:
234
case gpir_op_load_attribute:
235
return 0;
236
237
case gpir_op_complex1:
238
return 2;
239
240
default:
241
return 1;
242
}
243
}
244
245
static int gpir_get_min_dist(gpir_dep *dep)
246
{
247
switch (dep->type) {
248
case GPIR_DEP_INPUT:
249
switch (dep->succ->op) {
250
case gpir_op_store_temp:
251
case gpir_op_store_reg:
252
case gpir_op_store_varying:
253
/* Stores must use an alu node as input. Also, complex1 takes two
254
* cycles, which means that its result cannot be stored to a register
255
* as part of the normal path, and therefore it must also have a move
256
* inserted.
257
*/
258
if (dep->pred->type == gpir_node_type_load ||
259
dep->pred->op == gpir_op_complex1)
260
return INT_MAX >> 2;
261
else
262
return 0;
263
264
default:
265
return gpir_min_dist_alu(dep);
266
}
267
268
case GPIR_DEP_OFFSET:
269
assert(dep->succ->op == gpir_op_store_temp);
270
return gpir_min_dist_alu(dep);
271
272
case GPIR_DEP_READ_AFTER_WRITE:
273
if (dep->succ->op == gpir_op_load_temp &&
274
dep->pred->op == gpir_op_store_temp) {
275
return 4;
276
} else if (dep->succ->op == gpir_op_load_reg &&
277
dep->pred->op == gpir_op_store_reg) {
278
return 3;
279
} else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
280
dep->pred->op == gpir_op_store_temp_load_off1 ||
281
dep->pred->op == gpir_op_store_temp_load_off2) &&
282
dep->succ->op == gpir_op_load_uniform) {
283
return 4;
284
} else {
285
/* Fake dependency */
286
return 0;
287
}
288
289
case GPIR_DEP_WRITE_AFTER_READ:
290
return 0;
291
}
292
293
return 0;
294
}
295
296
static int gpir_max_dist_alu(gpir_dep *dep)
297
{
298
switch (dep->pred->op) {
299
case gpir_op_load_uniform:
300
case gpir_op_load_temp:
301
return 0;
302
case gpir_op_load_attribute:
303
return 1;
304
case gpir_op_load_reg:
305
if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
306
dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
307
return 0;
308
else
309
return 1;
310
case gpir_op_exp2_impl:
311
case gpir_op_log2_impl:
312
case gpir_op_rcp_impl:
313
case gpir_op_rsqrt_impl:
314
case gpir_op_store_temp_load_off0:
315
case gpir_op_store_temp_load_off1:
316
case gpir_op_store_temp_load_off2:
317
return 1;
318
case gpir_op_mov:
319
if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
320
return 1;
321
else
322
return 2;
323
default:
324
return 2;
325
}
326
}
327
328
static int gpir_get_max_dist(gpir_dep *dep)
329
{
330
switch (dep->type) {
331
case GPIR_DEP_INPUT:
332
switch (dep->succ->op) {
333
case gpir_op_store_temp:
334
case gpir_op_store_reg:
335
case gpir_op_store_varying:
336
return 0;
337
338
default:
339
return gpir_max_dist_alu(dep);
340
}
341
342
case GPIR_DEP_OFFSET:
343
assert(dep->succ->op == gpir_op_store_temp);
344
return gpir_max_dist_alu(dep);
345
346
default:
347
return INT_MAX >> 2; /* Don't want to overflow... */
348
}
349
}
350
351
static void schedule_update_distance(gpir_node *node)
352
{
353
if (gpir_node_is_leaf(node)) {
354
node->sched.dist = 0;
355
return;
356
}
357
358
gpir_node_foreach_pred(node, dep) {
359
gpir_node *pred = dep->pred;
360
361
if (pred->sched.dist < 0)
362
schedule_update_distance(pred);
363
364
int dist = pred->sched.dist + gpir_min_dist_alu(dep);
365
if (node->sched.dist < dist)
366
node->sched.dist = dist;
367
}
368
}
369
370
static bool gpir_is_input_node(gpir_node *node)
371
{
372
gpir_node_foreach_succ(node, dep) {
373
if (dep->type == GPIR_DEP_INPUT)
374
return true;
375
}
376
return false;
377
}
378
379
380
/* Get the number of slots required for a node on the ready list.
381
*/
382
static int gpir_get_slots_required(gpir_node *node)
383
{
384
if (!gpir_is_input_node(node))
385
return 0;
386
387
/* Note that we assume every node only consumes one slot, even dual-slot
388
* instructions. While dual-slot instructions may consume more than one
389
* slot, we can always safely insert a move if it turns out that there
390
* isn't enough space for them. There's the risk that we get stuck in an
391
* infinite loop if all the fully ready nodes are dual-slot nodes, but we
392
* rely on spilling to registers to save us here.
393
*/
394
return 1;
395
}
396
397
static void ASSERTED verify_ready_list(sched_ctx *ctx)
398
{
399
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
400
if (!gpir_is_input_node(node)) {
401
assert(node->sched.ready);
402
}
403
404
if (node->sched.ready) {
405
/* Every successor must have been scheduled */
406
gpir_node_foreach_succ(node, dep) {
407
assert(dep->succ->sched.instr);
408
}
409
} else {
410
/* There must be at least one successor that's not scheduled. */
411
bool unscheduled = false;
412
gpir_node_foreach_succ(node, dep) {
413
unscheduled |= !(dep->succ->sched.instr);
414
}
415
416
assert(unscheduled);
417
}
418
}
419
}
420
421
static void schedule_insert_ready_list(sched_ctx *ctx,
422
gpir_node *insert_node)
423
{
424
/* A node is fully ready when all of its successors have been scheduled,
 * and partially ready when only some of its input successors have been
 * scheduled.
 *
 * Both fully ready and partially ready nodes need to be inserted into the
 * ready list, but we only schedule a move node for a partially ready node.
 */
432
bool ready = true, insert = false;
433
gpir_node_foreach_succ(insert_node, dep) {
434
gpir_node *succ = dep->succ;
435
if (succ->sched.instr) {
436
if (dep->type == GPIR_DEP_INPUT)
437
insert = true;
438
}
439
else
440
ready = false;
441
}
442
443
insert_node->sched.ready = ready;
444
/* for root node */
445
insert |= ready;
446
447
if (!insert || insert_node->sched.inserted)
448
return;
449
450
struct list_head *insert_pos = &ctx->ready_list;
451
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
452
if ((insert_node->sched.dist > node->sched.dist ||
453
gpir_op_infos[insert_node->op].schedule_first) &&
454
!gpir_op_infos[node->op].schedule_first) {
455
insert_pos = &node->list;
456
break;
457
}
458
}
459
460
list_addtail(&insert_node->list, insert_pos);
461
insert_node->sched.inserted = true;
462
ctx->ready_list_slots += gpir_get_slots_required(insert_node);
463
}
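
/* Illustration (not from the source): if node A is used by B and C, and only
 * B has been scheduled so far, A is inserted as partially ready -- a move can
 * be scheduled to keep its value alive, but A itself cannot be placed yet.
 * Once C is also scheduled, A becomes fully ready and is a candidate for
 * placement.
 */
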
464
465
static int gpir_get_max_start(gpir_node *node)
466
{
467
int max_start = 0;
468
469
/* find the max start instr constrained by all successors */
470
gpir_node_foreach_succ(node, dep) {
471
gpir_node *succ = dep->succ;
472
if (!succ->sched.instr)
473
continue;
474
475
int start = succ->sched.instr->index + gpir_get_min_dist(dep);
476
if (start > max_start)
477
max_start = start;
478
}
479
480
return max_start;
481
}
482
483
static int gpir_get_min_end(gpir_node *node)
484
{
485
int min_end = INT_MAX;
486
487
/* find the min end instr constrained by all successors */
488
gpir_node_foreach_succ(node, dep) {
489
gpir_node *succ = dep->succ;
490
if (!succ->sched.instr)
491
continue;
492
493
int end = succ->sched.instr->index + gpir_get_max_dist(dep);
494
if (end < min_end)
495
min_end = end;
496
}
497
498
return min_end;
499
}
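
/* Sketch of how the two helpers above are used (illustration only; the
 * hypothetical helper below simply mirrors the check in _try_place_node()
 * rather than adding new behaviour): a node may only be placed in an
 * instruction whose index lies inside the window they bound.
 */
#if 0
static bool example_in_window(gpir_instr *instr, gpir_node *node)
{
   return instr->index >= gpir_get_max_start(node) &&
          instr->index <= gpir_get_min_end(node);
}
#endif
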
500
501
static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
502
{
503
gpir_load_node *load = gpir_node_to_load(node);
504
505
for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
506
if (!instr->slots[i])
507
continue;
508
509
gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
510
if (load->node.op == iload->node.op &&
511
load->index == iload->index &&
512
load->component == iload->component)
513
return &iload->node;
514
}
515
return NULL;
516
}
517
518
/* Simply place the node into the given instruction without trying to deal
519
* with liveness or the ready list. This will only fail if the instruction
520
* cannot be placed due to a lack of available slots. In addition to normal
521
* node placement, this is also used for placing loads when spilling to
522
* registers.
523
*/
524
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
525
{
526
if (node->type == gpir_node_type_load) {
527
gpir_node *load = gpir_sched_instr_has_load(instr, node);
528
if (load) {
529
/* This node may have a store as a successor, in which case we have to
530
* fail it exactly like below in order to later create a move node in
531
* between.
532
*/
533
if (instr->index < gpir_get_max_start(node))
534
return false;
535
536
gpir_debug("same load %d in instr %d for node %d\n",
537
load->index, instr->index, node->index);
538
539
/* not really merging the two nodes, just pretend it was scheduled in the same place */
540
node->sched.instr = load->sched.instr;
541
node->sched.pos = load->sched.pos;
542
return true;
543
}
544
}
545
546
if (node->op == gpir_op_store_reg) {
547
/* This register may be loaded in the next basic block, in which case
548
* there still needs to be a 2 instruction gap. We do what the blob
549
* seems to do and simply disable stores in the last two instructions of
550
* the basic block.
551
*
552
* TODO: We may be able to do better than this, but we have to check
553
* first if storing a register works across branches.
554
*/
555
if (instr->index < 2)
556
return false;
557
}
558
559
node->sched.instr = instr;
560
561
int max_node_spill_needed = INT_MAX;
562
int total_spill_needed = INT_MAX;
563
int *slots = gpir_op_infos[node->op].slots;
564
for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
565
node->sched.pos = slots[i];
566
if (instr->index >= gpir_get_max_start(node) &&
567
instr->index <= gpir_get_min_end(node) &&
568
gpir_instr_try_insert_node(instr, node))
569
return true;
570
if (ctx->instr->non_cplx_slot_difference ||
571
ctx->instr->slot_difference) {
572
/* If one of these fields is non-zero, then we could insert the node
573
* here after spilling. To get an accurate count of how many nodes we
574
* need to spill, we need to choose one of the positions where there
575
* were nonzero slot differences, preferably one with the smallest
576
* difference (so we don't have to spill as much).
577
*/
578
if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
579
ctx->instr->slot_difference < total_spill_needed) {
580
max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
581
total_spill_needed = ctx->instr->slot_difference;
582
}
583
}
584
}
585
586
if (max_node_spill_needed != INT_MAX) {
587
/* Indicate how many spill nodes are needed. */
588
ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
589
max_node_spill_needed);
590
ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
591
total_spill_needed);
592
}
593
node->sched.instr = NULL;
594
node->sched.pos = -1;
595
return false;
596
}
597
598
/* Try to place just the node given, updating the ready list. If "speculative"
599
* is true, then this is part of the pre-commit phase. If false, then we have
600
* committed to placing this node, so update liveness and ready list
601
* information.
602
*/
603
604
static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
605
bool speculative)
606
{
607
if (!_try_place_node(ctx, ctx->instr, node)) {
608
if (!speculative)
609
gpir_debug("failed to place %d\n", node->index);
610
return false;
611
}
612
613
ctx->ready_list_slots -= gpir_get_slots_required(node);
614
615
if (!speculative) {
616
gpir_debug("placed node %d\n", node->index);
617
618
/* We assume here that writes are placed before reads. If this changes,
619
* then this needs to be updated.
620
*/
621
if (node->op == gpir_op_store_reg) {
622
gpir_store_node *store = gpir_node_to_store(node);
623
ctx->live_physregs &=
624
~(1ull << (4 * store->index + store->component));
625
if (store->child->sched.physreg_store == store)
626
store->child->sched.physreg_store = NULL;
627
}
628
629
if (node->op == gpir_op_load_reg) {
630
gpir_load_node *load = gpir_node_to_load(node);
631
ctx->live_physregs |=
632
(1ull << (4 * load->index + load->component));
633
}
634
635
list_del(&node->list);
636
list_add(&node->list, &ctx->block->node_list);
637
gpir_node_foreach_pred(node, dep) {
638
gpir_node *pred = dep->pred;
639
schedule_insert_ready_list(ctx, pred);
640
}
641
} else {
642
gpir_node_foreach_pred(node, dep) {
643
gpir_node *pred = dep->pred;
644
if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
645
ctx->ready_list_slots += gpir_get_slots_required(pred);
646
}
647
}
648
649
return true;
650
}
651
652
/* Create a new node with "node" as the child, replace all uses of "node" with
653
* this new node, and replace "node" with it in the ready list.
654
*/
655
static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
656
gpir_op op)
657
{
658
659
gpir_alu_node *new_node = gpir_node_create(node->block, op);
660
if (unlikely(!new_node))
661
return NULL;
662
663
new_node->children[0] = node;
664
new_node->num_child = 1;
665
666
new_node->node.sched.instr = NULL;
667
new_node->node.sched.pos = -1;
668
new_node->node.sched.dist = node->sched.dist;
669
new_node->node.sched.max_node = node->sched.max_node;
670
new_node->node.sched.next_max_node = node->sched.next_max_node;
671
new_node->node.sched.complex_allowed = node->sched.complex_allowed;
672
673
ctx->ready_list_slots--;
674
list_del(&node->list);
675
node->sched.max_node = false;
676
node->sched.next_max_node = false;
677
node->sched.ready = false;
678
node->sched.inserted = false;
679
gpir_node_replace_succ(&new_node->node, node);
680
gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
681
schedule_insert_ready_list(ctx, &new_node->node);
682
return &new_node->node;
683
}
684
685
static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
686
{
687
gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
688
gpir_debug("create move %d for %d\n", move->index, node->index);
689
return move;
690
}
691
692
static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
693
{
694
assert(node->op == gpir_op_complex1);
695
gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
696
gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
697
return postlog2;
698
}
699
700
/* Once we schedule the successor, would the predecessor be fully ready? */
701
static bool pred_almost_ready(gpir_dep *dep)
702
{
703
bool fully_ready = true;
704
gpir_node_foreach_succ(dep->pred, other_dep) {
705
gpir_node *succ = other_dep->succ;
706
if (!succ->sched.instr && dep->succ != other_dep->succ) {
707
fully_ready = false;
708
break;
709
}
710
}
711
712
return fully_ready;
713
}
714
715
/* Recursively try to schedule a node and all its dependent nodes that can fit
716
* in the same instruction. There is a simple heuristic scoring system to try
717
* to group together nodes that load different components of the same input,
718
* to avoid bottlenecking for operations like matrix multiplies that are
719
* mostly input-bound.
720
*/
721
722
static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
723
{
724
if (!schedule_try_place_node(ctx, node, speculative))
725
return INT_MIN;
726
727
int score = 0;
728
729
gpir_node_foreach_pred(node, dep) {
730
if (dep->type != GPIR_DEP_INPUT)
731
continue;
732
733
int pred_score = INT_MIN;
734
if (pred_almost_ready(dep)) {
735
if (dep->pred->type == gpir_node_type_load ||
736
node->type == gpir_node_type_store) {
737
pred_score = _schedule_try_node(ctx, dep->pred, speculative);
738
}
739
}
740
if (dep->pred->type == gpir_node_type_load ||
741
node->type == gpir_node_type_store) {
742
if (pred_score == INT_MIN) {
743
if (node->op == gpir_op_mov) {
744
/* The only moves on the ready list are for loads that we
745
* couldn't schedule immediately, as created below. If we
746
* couldn't schedule the load, there's no point scheduling
747
* the move. The normal move threading logic will ensure
748
* that another move is created if we're about to go too far
749
* from the uses of this move.
750
*/
751
assert(speculative);
752
return INT_MIN;
753
} else if (!speculative && dep->pred->type == gpir_node_type_load) {
754
/* We couldn't schedule the load right away, so it will have
755
* to happen in some earlier instruction and then be moved
756
* into a value register and threaded to the use by "node".
757
* We create the move right away, so that later we'll fail
758
* to schedule it if there isn't a slot for a move
759
* available.
760
*/
761
create_move(ctx, dep->pred);
762
}
763
/* Penalize nodes whose dependent ops we couldn't schedule.
764
*/
765
score--;
766
} else {
767
score += pred_score;
768
continue;
769
}
770
}
771
}
772
773
return score;
774
}
775
776
/* If we speculatively tried a node, undo everything.
777
*/
778
779
static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
780
{
781
gpir_instr_remove_node(ctx->instr, node);
782
783
gpir_node_foreach_pred(node, dep) {
784
gpir_node *pred = dep->pred;
785
if (pred->sched.instr) {
786
schedule_undo_node(ctx, pred);
787
}
788
}
789
}
790
791
/* Try to schedule a node. We also try to schedule any predecessors that can
792
* be part of the same instruction. If "speculative" is true, then we don't
793
* actually change any state, only returning the score were the node to be
794
* scheduled, with INT_MIN meaning "cannot be scheduled at all".
795
*/
796
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
797
{
798
int prev_slots = ctx->ready_list_slots;
799
800
int score = _schedule_try_node(ctx, node, speculative);
801
802
if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
803
assert(speculative);
804
ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
805
ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
806
score = INT_MIN;
807
}
808
809
if (speculative) {
810
ctx->ready_list_slots = prev_slots;
811
if (node->sched.instr)
812
schedule_undo_node(ctx, node);
813
}
814
815
return score;
816
}
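
/* Usage sketch (illustrative only; this is effectively what try_node() below
 * does): score a node speculatively first, which rolls back all scheduler
 * state, and only commit once the speculative pass says it can be scheduled.
 */
#if 0
int score = schedule_try_node(ctx, node, true);   /* state is rolled back */
if (score != INT_MIN)
   schedule_try_node(ctx, node, false);           /* commit for real */
#endif
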
817
818
/* This is called when we want to spill "node" by inserting loads at its uses.
819
* It returns all the possible registers we can use so that all the loads will
820
* successfully be inserted. Also return the first instruction we'll need to
821
* insert a load for.
822
*/
823
824
static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
825
int *min_index)
826
{
827
uint64_t available = ~0ull;
828
gpir_node_foreach_succ(node, dep) {
829
if (dep->type != GPIR_DEP_INPUT)
830
continue;
831
832
gpir_node *use = dep->succ;
833
gpir_instr *instr = use->sched.instr;
834
835
if (!instr) {
836
/* This use isn't scheduled, so no need to spill it. */
837
continue;
838
}
839
840
if (use->type == gpir_node_type_store) {
841
/* We're trying to spill something that was recently stored... just
842
* bail out.
843
*/
844
return 0;
845
}
846
847
if (use->op == gpir_op_mov && instr == ctx->instr) {
848
/* We try to spill the sources of this move, so we can free up space
849
* in the current instruction.
850
*
851
* TODO: should we go back further? It might let us schedule the
852
* write earlier in some cases, but then we might fail to spill.
853
*/
854
available &= get_available_regs(ctx, use, min_index);
855
} else {
856
if (instr->index < *min_index)
857
*min_index = instr->index;
858
859
uint64_t use_available = 0;
860
861
if (instr->reg0_use_count == 0)
862
use_available = ~0ull;
863
else if (!instr->reg0_is_attr)
864
use_available = 0xfull << (4 * instr->reg0_index);
865
866
if (instr->reg1_use_count == 0)
867
use_available = ~0ull;
868
else
869
use_available |= 0xfull << (4 * instr->reg1_index);
870
871
available &= use_available;
872
}
873
}
874
875
return available;
876
}
877
878
/* Using "min_index" returned by get_available_regs(), figure out which
879
* registers are killed by a write after or during the current instruction and
880
* hence we can't use for spilling. Writes that haven't been scheduled yet
881
* should be reflected in live_physregs.
882
*/
883
884
static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
885
{
886
uint64_t killed = 0;
887
888
list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
889
if (instr->index <= min_index)
890
break;
891
892
for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
893
slot++) {
894
if (!instr->slots[slot])
895
continue;
896
897
gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
898
if (store->node.op != gpir_op_store_reg)
899
continue;
900
901
killed |= 1ull << (4 * store->index + store->component);
902
}
903
}
904
905
return killed;
906
}
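
/* Sketch of how the two queries above combine (illustration only; this
 * mirrors try_spill_node() below): a register is usable for spilling only if
 * every scheduled use can reach it, it is not killed by a later write, and
 * it is not already live.
 */
#if 0
int min_index = INT_MAX;
uint64_t usable = get_available_regs(ctx, node, &min_index);
usable &= ~get_killed_regs(ctx, min_index);
usable &= ~ctx->live_physregs;   /* only when the node has no store yet */
#endif
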
907
908
/* Actually spill a node so that it is no longer in the ready list. Note that
909
* this must exactly follow the logic of get_available_regs() or else the
910
* loads could fail to schedule.
911
*/
912
913
static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
914
{
915
gpir_node_foreach_succ_safe(node, dep) {
916
if (dep->type != GPIR_DEP_INPUT)
917
continue;
918
919
gpir_node *use = dep->succ;
920
gpir_instr *instr = use->sched.instr;
921
922
if (!instr)
923
continue;
924
925
if (use->op == gpir_op_mov && instr == ctx->instr) {
926
spill_node(ctx, use, store);
927
} else {
928
gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
929
load->index = store->index;
930
load->component = store->component;
931
list_add(&load->node.list, &ctx->block->node_list);
932
gpir_node_replace_child(dep->succ, dep->pred, &load->node);
933
gpir_node_replace_pred(dep, &load->node);
934
gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
935
gpir_debug("spilling use %d of node %d to load node %d\n",
936
use->index, node->index, load->node.index);
937
ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
938
assert(result);
939
}
940
}
941
942
if (node->op == gpir_op_mov) {
943
/* We replaced all the uses of the move, so it's dead now. */
944
gpir_instr_remove_node(node->sched.instr, node);
945
gpir_node_delete(node);
946
} else {
947
/* We deleted all the uses of the node except the store, so it's not
948
* live anymore.
949
*/
950
list_del(&node->list);
951
node->sched.inserted = false;
952
ctx->ready_list_slots--;
953
if (node->sched.max_node) {
954
node->sched.max_node = false;
955
ctx->instr->alu_num_slot_needed_by_max--;
956
}
957
if (node->sched.next_max_node) {
958
node->sched.next_max_node = false;
959
ctx->instr->alu_num_unscheduled_next_max--;
960
}
961
}
962
}
963
964
static bool used_by_store(gpir_node *node, gpir_instr *instr)
965
{
966
gpir_node_foreach_succ(node, dep) {
967
if (dep->type != GPIR_DEP_INPUT)
968
continue;
969
970
if (dep->succ->type == gpir_node_type_store &&
971
dep->succ->sched.instr == instr)
972
return true;
973
}
974
975
return false;
976
}
977
978
static gpir_node *consuming_postlog2(gpir_node *node)
979
{
980
if (node->op != gpir_op_complex1)
981
return NULL;
982
983
gpir_node_foreach_succ(node, dep) {
984
if (dep->type != GPIR_DEP_INPUT)
985
continue;
986
if (dep->succ->op == gpir_op_postlog2)
987
return dep->succ;
988
else
989
return NULL;
990
}
991
992
return NULL;
993
}
994
995
static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
996
{
997
assert(node->op != gpir_op_mov);
998
999
if (used_by_store(node, ctx->instr))
1000
return false;
1001
1002
gpir_debug("trying to spill %d\n", node->index);
1003
1004
int min_instr = INT_MAX;
1005
uint64_t available = get_available_regs(ctx, node, &min_instr);
1006
available &= ~get_killed_regs(ctx, min_instr);
1007
1008
if (node->sched.physreg_store) {
1009
gpir_store_node *store = node->sched.physreg_store;
1010
if (!(available & (1ull << (4 * store->index + store->component))))
1011
return false;
1012
} else {
1013
available &= ~ctx->live_physregs;
1014
1015
if (available == 0)
1016
return false;
1017
1018
/* Don't spill complex1 if it's consumed by a postlog2: turn the postlog2
 * into a move, wrap the complex1 in a new postlog2, and spill that
 * instead. The store needs a move anyway, so the postlog2 is usually free.
 */
1022
gpir_node *postlog2 = consuming_postlog2(node);
1023
if (postlog2) {
1024
postlog2->op = gpir_op_mov;
1025
node = create_postlog2(ctx, node);
1026
}
1027
1028
/* TODO: use a better heuristic for choosing an available register? */
1029
int physreg = ffsll(available) - 1;
1030
1031
ctx->live_physregs |= (1ull << physreg);
1032
1033
gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
1034
store->index = physreg / 4;
1035
store->component = physreg % 4;
1036
store->child = node;
1037
store->node.sched.max_node = false;
1038
store->node.sched.next_max_node = false;
1039
store->node.sched.complex_allowed = false;
1040
store->node.sched.pos = -1;
1041
store->node.sched.instr = NULL;
1042
store->node.sched.inserted = false;
1043
store->node.sched.dist = node->sched.dist;
1044
if (node->op == gpir_op_complex1) {
1045
/* Complex1 cannot be directly stored, and has a latency of 2 */
1046
store->node.sched.dist += 2;
1047
}
1048
node->sched.physreg_store = store;
1049
gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
1050
1051
list_for_each_entry(gpir_load_node, load,
1052
&ctx->physreg_reads[physreg], reg_link) {
1053
gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
1054
if (load->node.sched.ready) {
1055
list_del(&load->node.list);
1056
load->node.sched.ready = false;
1057
}
1058
}
1059
1060
node->sched.ready = false;
1061
schedule_insert_ready_list(ctx, &store->node);
1062
}
1063
1064
gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
1065
node->sched.physreg_store->index,
1066
"xyzw"[node->sched.physreg_store->component],
1067
node->sched.physreg_store->node.index);
1068
1069
spill_node(ctx, node, node->sched.physreg_store);
1070
1071
return true;
1072
}
1073
1074
static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
1075
{
1076
/* First, try to spill max nodes. */
1077
list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1078
if (ctx->max_node_spill_needed <= 0)
1079
break;
1080
1081
/* orig_node is the node we're trying to schedule, so spilling it makes
1082
* no sense. Also don't try to spill any nodes in front of it, since
1083
* they might be scheduled instead.
1084
*/
1085
if (node == orig_node)
1086
break;
1087
1088
if (node->op == gpir_op_mov) {
1089
/* Don't try to spill loads, since that only adds another load and
1090
* store which is likely pointless.
1091
*/
1092
continue;
1093
}
1094
1095
if (!gpir_is_input_node(node) || !node->sched.max_node)
1096
continue;
1097
1098
if (try_spill_node(ctx, node)) {
1099
ctx->max_node_spill_needed--;
1100
ctx->total_spill_needed--;
1101
}
1102
}
1103
1104
/* Now, try to spill the remaining nodes. */
1105
list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1106
if (ctx->total_spill_needed <= 0)
1107
break;
1108
1109
if (node == orig_node)
1110
break;
1111
1112
if (node->op == gpir_op_mov)
1113
continue;
1114
1115
if (!gpir_is_input_node(node) ||
1116
!(node->sched.max_node || node->sched.next_max_node))
1117
continue;
1118
1119
if (try_spill_node(ctx, node))
1120
ctx->total_spill_needed--;
1121
}
1122
1123
return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
1124
}
1125
1126
static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
1127
{
1128
int total = 0;
1129
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1130
total += gpir_get_slots_required(node);
1131
}
1132
1133
return total;
1134
}
1135
1136
/* What gpir_get_min_end() would return if node were replaced with a move
1137
* instruction not in the complex slot. Normally this is 2 + min_end, except
1138
* for some store instructions which must have the move node in the same
1139
* instruction.
1140
*/
1141
static int gpir_get_min_end_as_move(gpir_node *node)
1142
{
1143
int min = INT_MAX;
1144
gpir_node_foreach_succ(node, dep) {
1145
gpir_node *succ = dep->succ;
1146
if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
1147
switch (succ->op) {
1148
case gpir_op_store_temp:
1149
case gpir_op_store_reg:
1150
case gpir_op_store_varying:
1151
continue;
1152
default:
1153
break;
1154
}
1155
if (min > succ->sched.instr->index + 2)
1156
min = succ->sched.instr->index + 2;
1157
}
1158
}
1159
return min;
1160
}
1161
1162
/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
1163
* The hardware overwrites the add second sources with 0 and mul second
1164
* sources with 1. This can be a problem if we need to insert more next-max
1165
* moves but we only have values that can't use the complex unit for moves.
1166
*
1167
* Fortunately, we only need to insert a next-max move if there are more than
1168
* 5 next-max nodes, but there are only 4 sources in the previous instruction
1169
* that make values not complex-capable, which means there can be at most 4
1170
* non-complex-capable values. Hence there will always be at least two values
1171
* that can be rewritten to use a move in the complex slot. However, we have
1172
* to be careful not to waste those values by putting both of them in a
1173
* non-complex slot. This is handled for us by gpir_instr, which will reject
1174
* such instructions. We just need to tell it which nodes can use complex, and
1175
* it will do the accounting to figure out what is safe.
1176
*/
1177
1178
static bool can_use_complex(gpir_node *node)
1179
{
1180
gpir_node_foreach_succ(node, dep) {
1181
if (dep->type != GPIR_DEP_INPUT)
1182
continue;
1183
1184
gpir_node *succ = dep->succ;
1185
if (succ->type != gpir_node_type_alu ||
1186
!succ->sched.instr)
1187
continue;
1188
1189
/* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
1190
*/
1191
gpir_alu_node *alu = gpir_node_to_alu(succ);
1192
switch (alu->node.op) {
1193
case gpir_op_complex1:
1194
/* complex1 puts its third source in the fourth slot */
1195
if (alu->children[1] == node || alu->children[2] == node)
1196
return false;
1197
break;
1198
case gpir_op_complex2:
1199
/* complex2 has its source duplicated, since it actually takes two
1200
* sources but we only ever use it with both sources the same. Hence
1201
* its source can never be the complex slot.
1202
*/
1203
return false;
1204
case gpir_op_select:
1205
/* Select has its sources rearranged */
1206
if (alu->children[0] == node)
1207
return false;
1208
break;
1209
default:
1210
assert(alu->num_child <= 2);
1211
if (alu->num_child == 2 && alu->children[1] == node)
1212
return false;
1213
break;
1214
}
1215
}
1216
1217
return true;
1218
}
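
/* Worked example of the counting argument above (illustration only): an
 * instruction has 4 "second source" positions (mul0, mul1, add0, add1), so
 * at most 4 of the up-to-11 live values can be non-complex-capable. Even in
 * the worst case with 6 next-max nodes, at least 2 of them can have their
 * move placed in the complex slot.
 */
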
1219
1220
/* Initialize node->sched.max_node and node->sched.next_max_node for every
1221
* input node on the ready list. We should only need to do this once per
1222
* instruction, at the beginning, since we never add max nodes to the ready
1223
* list.
1224
*/
1225
1226
static void sched_find_max_nodes(sched_ctx *ctx)
1227
{
1228
ctx->instr->alu_num_unscheduled_next_max = 0;
1229
ctx->instr->alu_num_slot_needed_by_max = 0;
1230
1231
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1232
if (!gpir_is_input_node(node))
1233
continue;
1234
1235
int min_end_move = gpir_get_min_end_as_move(node);
1236
node->sched.max_node = (min_end_move == ctx->instr->index);
1237
node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
1238
if (node->sched.next_max_node)
1239
node->sched.complex_allowed = can_use_complex(node);
1240
1241
if (node->sched.max_node)
1242
ctx->instr->alu_num_slot_needed_by_max++;
1243
if (node->sched.next_max_node)
1244
ctx->instr->alu_num_unscheduled_next_max++;
1245
}
1246
}
1247
1248
/* Verify the invariants described in gpir.h, as well as making sure the
1249
* counts are correct.
1250
*/
1251
static void ASSERTED verify_max_nodes(sched_ctx *ctx)
1252
{
1253
int alu_num_slot_needed_by_max = 0;
1254
int alu_num_unscheduled_next_max = 0;
1255
int alu_num_slot_needed_by_store = 0;
1256
int alu_num_slot_needed_by_non_cplx_store = 0;
1257
ASSERTED int alu_max_allowed_next_max = 5;
1258
1259
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1260
if (!gpir_is_input_node(node))
1261
continue;
1262
1263
if (node->sched.max_node)
1264
alu_num_slot_needed_by_max++;
1265
if (node->sched.next_max_node)
1266
alu_num_unscheduled_next_max++;
1267
if (used_by_store(node, ctx->instr)) {
1268
alu_num_slot_needed_by_store++;
1269
if (node->sched.next_max_node && !node->sched.complex_allowed)
1270
alu_num_slot_needed_by_non_cplx_store++;
1271
}
1272
}
1273
1274
if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
1275
ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
1276
alu_max_allowed_next_max = 4;
1277
1278
assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
1279
assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
1280
assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
1281
assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
1282
assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
1283
alu_num_slot_needed_by_non_cplx_store);
1284
assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
1285
assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
1286
}
1287
1288
static bool try_node(sched_ctx *ctx)
1289
{
1290
gpir_node *best_node = NULL;
1291
int best_score = INT_MIN;
1292
1293
/* Spilling will delete arbitrary nodes after the current one in the ready
1294
* list, which means that we always need to look up the next node in the
1295
* list at the end of each iteration. While list_for_each_entry() works for
1296
* this purpose, its sanity checking assumes that you don't want to modify
1297
* the list at all. We know better here, so we have to open-code
1298
* list_for_each_entry() without the check in order to not assert.
1299
*/
1300
for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
1301
&node->list != &ctx->ready_list;
1302
node = LIST_ENTRY(gpir_node, node->list.next, list)) {
1303
if (best_score != INT_MIN) {
1304
if (node->sched.dist < best_node->sched.dist)
1305
break;
1306
}
1307
1308
if (node->sched.ready) {
1309
ctx->total_spill_needed = 0;
1310
ctx->max_node_spill_needed = 0;
1311
int score = schedule_try_node(ctx, node, true);
1312
if (score == INT_MIN && !best_node &&
1313
ctx->total_spill_needed > 0 &&
1314
try_spill_nodes(ctx, node)) {
1315
score = schedule_try_node(ctx, node, true);
1316
}
1317
1318
/* schedule_first nodes must be scheduled if possible */
1319
if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
1320
best_node = node;
1321
best_score = score;
1322
break;
1323
}
1324
1325
if (score > best_score) {
1326
best_score = score;
1327
best_node = node;
1328
}
1329
}
1330
}
1331
1332
if (best_node) {
1333
gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
1334
best_score, best_node->sched.max_node ? " (max)" : "");
1335
ASSERTED int score = schedule_try_node(ctx, best_node, false);
1336
assert(score != INT_MIN);
1337
return true;
1338
}
1339
1340
return false;
1341
}
1342
1343
static void place_move(sched_ctx *ctx, gpir_node *node)
1344
{
1345
/* For complex1 that is consumed by a postlog2, we cannot allow any moves
1346
* in between. Convert the postlog2 to a move and insert a new postlog2,
1347
* and try to schedule it again in try_node().
1348
*/
1349
gpir_node *postlog2 = consuming_postlog2(node);
1350
if (postlog2) {
1351
postlog2->op = gpir_op_mov;
1352
create_postlog2(ctx, node);
1353
return;
1354
}
1355
1356
gpir_node *move = create_move(ctx, node);
1357
gpir_node_foreach_succ_safe(move, dep) {
1358
gpir_node *succ = dep->succ;
1359
if (!succ->sched.instr ||
1360
ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
1361
gpir_node_replace_pred(dep, node);
1362
if (dep->type == GPIR_DEP_INPUT)
1363
gpir_node_replace_child(succ, move, node);
1364
}
1365
}
1366
ASSERTED int score = schedule_try_node(ctx, move, false);
1367
assert(score != INT_MIN);
1368
}
1369
1370
/* For next-max nodes, not every node can be offloaded to a move in the
1371
* complex slot. If we run out of non-complex slots, then such nodes cannot
1372
* have moves placed for them. There should always be sufficient
1373
* complex-capable nodes so that this isn't a problem. We also disallow moves
1374
* for schedule_first nodes here.
1375
*/
1376
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
1377
{
1378
if (gpir_op_infos[node->op].schedule_first)
1379
return false;
1380
1381
if (!node->sched.next_max_node)
1382
return true;
1383
1384
if (node->sched.complex_allowed)
1385
return true;
1386
1387
return ctx->instr->alu_non_cplx_slot_free > 0;
1388
}
1389
1390
static bool sched_move(sched_ctx *ctx)
1391
{
1392
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1393
if (node->sched.max_node) {
1394
place_move(ctx, node);
1395
return true;
1396
}
1397
}
1398
1399
if (ctx->instr->alu_num_slot_needed_by_store > 0) {
1400
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1401
if (used_by_store(node, ctx->instr)) {
1402
place_move(ctx, node);
1403
/* If we have a store of a load, then we need to make sure that we
1404
* immediately schedule the dependent load, or create a move
1405
* instruction for it, like we would with a normal instruction.
1406
* The rest of the code isn't set up to handle load nodes in the
1407
* ready list -- see the comments in _schedule_try_node().
1408
*/
1409
if (node->type == gpir_node_type_load) {
1410
if (!schedule_try_place_node(ctx, node, false)) {
1411
create_move(ctx, node);
1412
}
1413
}
1414
return true;
1415
}
1416
}
1417
}
1418
1419
/* complex1 is a bit of a special case, since it has a latency of 2 cycles.
1420
* Once it is fully ready, we need to group all its uses in the same
1421
* instruction, and then we need to avoid creating any moves in the next
1422
* cycle in order to get it scheduled. Failing to do any of these things
1423
* could result in a cycle penalty, or even worse, an infinite loop of
1424
* inserting moves. If it is a next-max node and ready, then it has a use
1425
* in the previous cycle. If it has a use in the current cycle as well,
1426
* then we want to insert a move node to make it ready in two cycles -- if
1427
* we don't, then there will be at least a one cycle penalty. Otherwise, it
1428
* will be ready next cycle, and we shouldn't insert a move node, or else
1429
* we'll also have a one cycle penalty.
1430
*/
1431
if (ctx->instr->alu_num_slot_free > 0) {
1432
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1433
if (!can_place_move(ctx, node))
1434
continue;
1435
1436
if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
1437
node->sched.ready) {
1438
bool skip = true;
1439
gpir_node_foreach_succ(node, dep) {
1440
if (dep->type != GPIR_DEP_INPUT)
1441
continue;
1442
1443
gpir_node *succ = dep->succ;
1444
1445
if (!succ->sched.instr ||
1446
succ->sched.instr->index != ctx->instr->index - 1) {
1447
skip = false;
1448
break;
1449
}
1450
}
1451
1452
if (skip)
1453
continue;
1454
1455
place_move(ctx, node);
1456
return true;
1457
}
1458
}
1459
}
1460
1461
/* Once we've made all the required moves, we're free to use any extra
1462
* slots to schedule more moves for next max nodes. Besides sometimes being
1463
* necessary, this can free up extra space in the next instruction. We walk
1464
* from back to front so that we pick nodes less likely to be scheduled
1465
* next first -- an extra move would be unnecessary there. But make sure
1466
* not to handle the complex1 case handled above.
1467
*/
1468
if (ctx->instr->alu_num_slot_free > 0) {
1469
list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
1470
if (!can_place_move(ctx, node))
1471
continue;
1472
1473
if (node->sched.next_max_node &&
1474
!(node->op == gpir_op_complex1 && node->sched.ready)) {
1475
place_move(ctx, node);
1476
return true;
1477
}
1478
}
1479
}
1480
1481
/* We may have skipped complex1 above, but if we run out of space, we still
1482
* need to insert the move.
1483
*/
1484
1485
if (ctx->instr->alu_num_unscheduled_next_max >
1486
ctx->instr->alu_max_allowed_next_max) {
1487
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1488
if (!can_place_move(ctx, node))
1489
continue;
1490
1491
if (node->sched.next_max_node) {
1492
place_move(ctx, node);
1493
return true;
1494
}
1495
}
1496
}
1497
1498
1499
return false;
1500
}
1501
1502
static bool gpir_sched_instr_pass(sched_ctx *ctx)
1503
{
1504
if (try_node(ctx))
1505
return true;
1506
1507
if (sched_move(ctx))
1508
return true;
1509
1510
return false;
1511
}
1512
1513
static void schedule_print_pre_one_instr(sched_ctx *ctx)
1514
{
1515
if (!(lima_debug & LIMA_DEBUG_GP))
1516
return;
1517
1518
printf("instr %d for ready list:", ctx->instr->index);
1519
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1520
printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
1521
node->sched.dist, gpir_get_slots_required(node),
1522
node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
1523
}
1524
printf("\nlive physregs: ");
1525
for (unsigned i = 0; i < 16; i++) {
1526
if (ctx->live_physregs & (0xfull << (4 * i))) {
1527
printf("$%d.", i);
1528
for (unsigned j = 0; j < 4; j++) {
1529
if (ctx->live_physregs & (1ull << (4 * i + j)))
1530
printf("%c", "xyzw"[j]);
1531
}
1532
printf(" ");
1533
}
1534
}
1535
printf("\n");
1536
}
1537
1538
static void schedule_print_post_one_instr(gpir_instr *instr)
1539
{
1540
if (!(lima_debug & LIMA_DEBUG_GP))
1541
return;
1542
1543
printf("post schedule instr");
1544
for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
1545
if (instr->slots[i])
1546
printf(" %d/%d", i, instr->slots[i]->index);
1547
}
1548
printf("\n");
1549
}
1550
1551
1552
static bool schedule_one_instr(sched_ctx *ctx)
1553
{
1554
gpir_instr *instr = gpir_instr_create(ctx->block);
1555
if (unlikely(!instr))
1556
return false;
1557
1558
ctx->instr = instr;
1559
1560
sched_find_max_nodes(ctx);
1561
schedule_print_pre_one_instr(ctx);
1562
1563
while (gpir_sched_instr_pass(ctx)) {
1564
assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
1565
#ifndef NDEBUG
1566
verify_max_nodes(ctx);
1567
verify_ready_list(ctx);
1568
#endif
1569
}
1570
1571
schedule_print_post_one_instr(instr);
1572
return true;
1573
}
1574
1575
static bool schedule_block(gpir_block *block)
1576
{
1577
/* calculate distance */
1578
list_for_each_entry(gpir_node, node, &block->node_list, list) {
1579
if (gpir_node_is_root(node))
1580
schedule_update_distance(node);
1581
}
1582
1583
sched_ctx ctx;
1584
list_inithead(&ctx.ready_list);
1585
ctx.block = block;
1586
ctx.ready_list_slots = 0;
1587
ctx.live_physregs = block->live_out_phys;
1588
1589
for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
1590
list_inithead(&ctx.physreg_reads[i]);
1591
}
1592
1593
/* construct the ready list from root nodes */
1594
list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1595
/* Add to physreg_reads */
1596
if (node->op == gpir_op_load_reg) {
1597
gpir_load_node *load = gpir_node_to_load(node);
1598
unsigned index = 4 * load->index + load->component;
1599
list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
1600
}
1601
1602
if (gpir_node_is_root(node))
1603
schedule_insert_ready_list(&ctx, node);
1604
}
1605
1606
list_inithead(&block->node_list);
1607
while (!list_is_empty(&ctx.ready_list)) {
1608
if (!schedule_one_instr(&ctx))
1609
return false;
1610
}
1611
1612
return true;
1613
}
1614
1615
static void add_fake_dep(gpir_node *node, gpir_node *dep_node,
1616
gpir_node *last_written[])
1617
{
1618
gpir_node_foreach_pred(node, dep) {
1619
if (dep->type == GPIR_DEP_INPUT) {
1620
int index = dep->pred->value_reg;
1621
if (index >= 0 && last_written[index]) {
1622
gpir_node_add_dep(last_written[index], dep_node,
1623
GPIR_DEP_WRITE_AFTER_READ);
1624
}
1625
if (gpir_op_infos[dep->pred->op].schedule_first) {
1626
/* Insert fake dependencies for any schedule_first children on
1627
* this node as well. This guarantees that as soon as
1628
* "dep_node" is ready to schedule, all of its schedule_first
1629
* children, grandchildren, etc. are ready so that they can be
1630
* scheduled as soon as possible.
1631
*/
1632
add_fake_dep(dep->pred, dep_node, last_written);
1633
}
1634
}
1635
}
1636
}
1637
1638
static void schedule_build_dependency(gpir_block *block)
1639
{
1640
gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
1641
1642
/* merge dummy_f/m into the node they were created from */
1643
list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1644
if (node->op == gpir_op_dummy_m) {
1645
gpir_alu_node *alu = gpir_node_to_alu(node);
1646
gpir_node *origin = alu->children[0];
1647
gpir_node *dummy_f = alu->children[1];
1648
1649
gpir_node_foreach_succ(node, dep) {
1650
gpir_node *succ = dep->succ;
1651
/* origin and node may have same succ (by VREG/INPUT or
1652
* VREG/VREG dep), so use gpir_node_add_dep() instead of
1653
* gpir_node_replace_pred() */
1654
gpir_node_add_dep(succ, origin, dep->type);
1655
gpir_node_replace_child(succ, node, origin);
1656
}
1657
gpir_node_delete(dummy_f);
1658
gpir_node_delete(node);
1659
}
1660
}
1661
1662
memset(last_written, 0, sizeof(last_written));
1663
1664
/* False dependencies. For value registers, these exist only to make sure
1665
* that the maximum pressure isn't exceeded and are hence "fake".
1666
*/
1667
list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
1668
if (node->op == gpir_op_load_reg) {
1669
gpir_load_node *load = gpir_node_to_load(node);
1670
unsigned index = 4 * load->index + load->component;
1671
if (last_written[index]) {
1672
gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
1673
}
1674
} else if (node->op == gpir_op_store_reg) {
1675
gpir_store_node *store = gpir_node_to_store(node);
1676
unsigned index = 4 * store->index + store->component;
1677
last_written[index] = node;
1678
} else {
1679
add_fake_dep(node, node, last_written);
1680
}
1681
1682
if (node->value_reg >= 0)
1683
last_written[node->value_reg] = node;
1684
}
1685
}
1686
1687
static void print_statistic(gpir_compiler *comp, int save_index)
1688
{
1689
int num_nodes[gpir_op_num] = {0};
1690
int num_created_nodes[gpir_op_num] = {0};
1691
1692
list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1693
list_for_each_entry(gpir_node, node, &block->node_list, list) {
1694
num_nodes[node->op]++;
1695
if (node->index >= save_index)
1696
num_created_nodes[node->op]++;
1697
}
1698
}
1699
1700
printf("====== gpir scheduler statistic ======\n");
1701
printf("---- how many nodes are scheduled ----\n");
1702
int n = 0, l = 0;
1703
for (int i = 0; i < gpir_op_num; i++) {
1704
if (num_nodes[i]) {
1705
printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
1706
n += num_nodes[i];
1707
if (!(++l % 4))
1708
printf("\n");
1709
}
1710
}
1711
if (l % 4)
1712
printf("\n");
1713
printf("\ntotal: %d\n", n);
1714
1715
printf("---- how many nodes are created ----\n");
1716
n = l = 0;
1717
for (int i = 0; i < gpir_op_num; i++) {
1718
if (num_created_nodes[i]) {
1719
printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
1720
n += num_created_nodes[i];
1721
if (!(++l % 4))
1722
printf("\n");
1723
}
1724
}
1725
if (l % 4)
1726
printf("\n");
1727
printf("\ntotal: %d\n", n);
1728
printf("------------------------------------\n");
1729
}
1730
1731
bool gpir_schedule_prog(gpir_compiler *comp)
1732
{
1733
int save_index = comp->cur_index;
1734
1735
/* init schedule info */
1736
int index = 0;
1737
list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1738
block->sched.instr_index = 0;
1739
list_for_each_entry(gpir_node, node, &block->node_list, list) {
1740
node->sched.instr = NULL;
1741
node->sched.pos = -1;
1742
node->sched.index = index++;
1743
node->sched.dist = -1;
1744
/* TODO when we support multiple basic blocks, we need a way to keep
1745
* track of this for physregs allocated before the scheduler.
1746
*/
1747
node->sched.physreg_store = NULL;
1748
node->sched.ready = false;
1749
node->sched.inserted = false;
1750
node->sched.complex_allowed = false;
1751
node->sched.max_node = false;
1752
node->sched.next_max_node = false;
1753
}
1754
}
1755
1756
/* build dependency */
1757
list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1758
schedule_build_dependency(block);
1759
}
1760
1761
//gpir_debug("after scheduler build reg dependency\n");
1762
//gpir_node_print_prog_dep(comp);
1763
1764
list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1765
if (!schedule_block(block)) {
1766
gpir_error("fail schedule block\n");
1767
return false;
1768
}
1769
}
1770
1771
if (lima_debug & LIMA_DEBUG_GP) {
1772
print_statistic(comp, save_index);
1773
gpir_instr_print_prog(comp);
1774
}
1775
1776
return true;
1777
}
1778
1779