CoCalc -- aco_spill.cpp

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_spill.cpp
⁴⁵⁵⁰ views
1
/*
2
 * Copyright © 2018 Valve Corporation
3
 * Copyright © 2018 Google
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
 * IN THE SOFTWARE.
23
 *
24
 */
25

26
#include "aco_builder.h"
27
#include "aco_ir.h"
28

29
#include "common/sid.h"
30

31
#include <map>
32
#include <set>
33
#include <stack>
34
#include <unordered_set>
35
#include <vector>
36

37
/*
38
 * Implements the spilling algorithm on SSA-form from
39
 * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
40
 * by Matthias Braun and Sebastian Hack.
41
 */
42

43
namespace aco {
44

45
namespace {
46

47
struct remat_info {
48
   Instruction* instr;
49
};
50

51
struct spill_ctx {
52
   RegisterDemand target_pressure;
53
   Program* program;
54
   std::vector<std::vector<RegisterDemand>> register_demand;
55
   std::vector<std::map<Temp, Temp>> renames;
56
   std::vector<std::map<Temp, uint32_t>> spills_entry;
57
   std::vector<std::map<Temp, uint32_t>> spills_exit;
58
   std::vector<bool> processed;
59
   std::stack<Block*> loop_header;
60
   std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start;
61
   std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end;
62
   std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;
63
   std::vector<std::vector<uint32_t>> affinities;
64
   std::vector<bool> is_reloaded;
65
   std::map<Temp, remat_info> remat;
66
   std::map<Instruction*, bool> remat_used;
67
   unsigned wave_size;
68

69
   spill_ctx(const RegisterDemand target_pressure_, Program* program_,
70
             std::vector<std::vector<RegisterDemand>> register_demand_)
71
       : target_pressure(target_pressure_), program(program_),
72
         register_demand(std::move(register_demand_)), renames(program->blocks.size()),
73
         spills_entry(program->blocks.size()), spills_exit(program->blocks.size()),
74
         processed(program->blocks.size(), false), wave_size(program->wave_size)
75
   {}
76

77
   void add_affinity(uint32_t first, uint32_t second)
78
   {
79
      unsigned found_first = affinities.size();
80
      unsigned found_second = affinities.size();
81
      for (unsigned i = 0; i < affinities.size(); i++) {
82
         std::vector<uint32_t>& vec = affinities[i];
83
         for (uint32_t entry : vec) {
84
            if (entry == first)
85
               found_first = i;
86
            else if (entry == second)
87
               found_second = i;
88
         }
89
      }
90
      if (found_first == affinities.size() && found_second == affinities.size()) {
91
         affinities.emplace_back(std::vector<uint32_t>({first, second}));
92
      } else if (found_first < affinities.size() && found_second == affinities.size()) {
93
         affinities[found_first].push_back(second);
94
      } else if (found_second < affinities.size() && found_first == affinities.size()) {
95
         affinities[found_second].push_back(first);
96
      } else if (found_first != found_second) {
97
         /* merge second into first */
98
         affinities[found_first].insert(affinities[found_first].end(),
99
                                        affinities[found_second].begin(),
100
                                        affinities[found_second].end());
101
         affinities.erase(std::next(affinities.begin(), found_second));
102
      } else {
103
         assert(found_first == found_second);
104
      }
105
   }
106

107
   void add_interference(uint32_t first, uint32_t second)
108
   {
109
      if (interferences[first].first.type() != interferences[second].first.type())
110
         return;
111

112
      bool inserted = interferences[first].second.insert(second).second;
113
      if (inserted)
114
         interferences[second].second.insert(first);
115
   }
116

117
   uint32_t allocate_spill_id(RegClass rc)
118
   {
119
      interferences.emplace_back(rc, std::unordered_set<uint32_t>());
120
      is_reloaded.push_back(false);
121
      return next_spill_id++;
122
   }
123

124
   uint32_t next_spill_id = 0;
125
};
126

127
int32_t
128
get_dominator(int idx_a, int idx_b, Program* program, bool is_linear)
129
{
130

131
   if (idx_a == -1)
132
      return idx_b;
133
   if (idx_b == -1)
134
      return idx_a;
135
   if (is_linear) {
136
      while (idx_a != idx_b) {
137
         if (idx_a > idx_b)
138
            idx_a = program->blocks[idx_a].linear_idom;
139
         else
140
            idx_b = program->blocks[idx_b].linear_idom;
141
      }
142
   } else {
143
      while (idx_a != idx_b) {
144
         if (idx_a > idx_b)
145
            idx_a = program->blocks[idx_a].logical_idom;
146
         else
147
            idx_b = program->blocks[idx_b].logical_idom;
148
      }
149
   }
150
   assert(idx_a != -1);
151
   return idx_a;
152
}
153

154
void
155
next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist)
156
{
157
   Block* block = &ctx.program->blocks[block_idx];
158
   std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx];
159

160
   /* to compute the next use distance at the beginning of the block, we have to add the block's
161
    * size */
162
   for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin();
163
        it != next_uses.end(); ++it)
164
      it->second.second = it->second.second + block->instructions.size();
165

166
   int idx = block->instructions.size() - 1;
167
   while (idx >= 0) {
168
      aco_ptr<Instruction>& instr = block->instructions[idx];
169

170
      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
171
         break;
172

173
      for (const Definition& def : instr->definitions) {
174
         if (def.isTemp())
175
            next_uses.erase(def.getTemp());
176
      }
177

178
      for (const Operand& op : instr->operands) {
179
         /* omit exec mask */
180
         if (op.isFixed() && op.physReg() == exec)
181
            continue;
182
         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
183
            continue;
184
         if (op.isTemp())
185
            next_uses[op.getTemp()] = {block_idx, idx};
186
      }
187
      idx--;
188
   }
189

190
   assert(block_idx != 0 || next_uses.empty());
191
   ctx.next_use_distances_start[block_idx] = next_uses;
192
   while (idx >= 0) {
193
      aco_ptr<Instruction>& instr = block->instructions[idx];
194
      assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi);
195

196
      if (!instr->definitions[0].isTemp()) {
197
         idx--;
198
         continue;
199
      }
200

201
      auto it = next_uses.find(instr->definitions[0].getTemp());
202
      std::pair<uint32_t, uint32_t> distance =
203
         it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second;
204
      for (unsigned i = 0; i < instr->operands.size(); i++) {
205
         unsigned pred_idx =
206
            instr->opcode == aco_opcode::p_phi ? block->logical_preds[i] : block->linear_preds[i];
207
         if (instr->operands[i].isTemp()) {
208
            if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) ==
209
                   ctx.next_use_distances_end[pred_idx].end() ||
210
                ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance)
211
               worklist.insert(pred_idx);
212
            ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance;
213
         }
214
      }
215
      next_uses.erase(instr->definitions[0].getTemp());
216
      idx--;
217
   }
218

219
   /* all remaining live vars must be live-out at the predecessors */
220
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) {
221
      Temp temp = pair.first;
222
      uint32_t distance = pair.second.second;
223
      uint32_t dom = pair.second.first;
224
      std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds;
225
      for (unsigned pred_idx : preds) {
226
         if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth)
227
            distance += 0xFFFF;
228
         if (ctx.next_use_distances_end[pred_idx].find(temp) !=
229
             ctx.next_use_distances_end[pred_idx].end()) {
230
            dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program,
231
                                temp.is_linear());
232
            distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance);
233
         }
234
         if (ctx.next_use_distances_end[pred_idx][temp] !=
235
             std::pair<uint32_t, uint32_t>{dom, distance})
236
            worklist.insert(pred_idx);
237
         ctx.next_use_distances_end[pred_idx][temp] = {dom, distance};
238
      }
239
   }
240
}
241

242
void
243
compute_global_next_uses(spill_ctx& ctx)
244
{
245
   ctx.next_use_distances_start.resize(ctx.program->blocks.size());
246
   ctx.next_use_distances_end.resize(ctx.program->blocks.size());
247
   std::set<uint32_t> worklist;
248
   for (Block& block : ctx.program->blocks)
249
      worklist.insert(block.index);
250

251
   while (!worklist.empty()) {
252
      std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
253
      unsigned block_idx = *b_it;
254
      worklist.erase(block_idx);
255
      next_uses_per_block(ctx, block_idx, worklist);
256
   }
257
}
258

259
bool
260
should_rematerialize(aco_ptr<Instruction>& instr)
261
{
262
   /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
263
   if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&
264
       instr->format != Format::PSEUDO && instr->format != Format::SOPK)
265
      return false;
266
   /* TODO: pseudo-instruction rematerialization is only supported for
267
    * p_create_vector/p_parallelcopy */
268
   if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&
269
       instr->opcode != aco_opcode::p_parallelcopy)
270
      return false;
271
   if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)
272
      return false;
273

274
   for (const Operand& op : instr->operands) {
275
      /* TODO: rematerialization using temporaries isn't yet supported */
276
      if (!op.isConstant())
277
         return false;
278
   }
279

280
   /* TODO: rematerialization with multiple definitions isn't yet supported */
281
   if (instr->definitions.size() > 1)
282
      return false;
283

284
   return true;
285
}
286

287
aco_ptr<Instruction>
288
do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
289
{
290
   std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
291
   if (remat != ctx.remat.end()) {
292
      Instruction* instr = remat->second.instr;
293
      assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&
294
             "unsupported");
295
      assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||
296
              instr->opcode == aco_opcode::p_parallelcopy) &&
297
             "unsupported");
298
      assert(instr->definitions.size() == 1 && "unsupported");
299

300
      aco_ptr<Instruction> res;
301
      if (instr->isVOP1()) {
302
         res.reset(create_instruction<VOP1_instruction>(
303
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
304
      } else if (instr->isSOP1()) {
305
         res.reset(create_instruction<SOP1_instruction>(
306
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
307
      } else if (instr->isPseudo()) {
308
         res.reset(create_instruction<Pseudo_instruction>(
309
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
310
      } else if (instr->isSOPK()) {
311
         res.reset(create_instruction<SOPK_instruction>(
312
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
313
         res->sopk().imm = instr->sopk().imm;
314
      }
315
      for (unsigned i = 0; i < instr->operands.size(); i++) {
316
         res->operands[i] = instr->operands[i];
317
         if (instr->operands[i].isTemp()) {
318
            assert(false && "unsupported");
319
            if (ctx.remat.count(instr->operands[i].getTemp()))
320
               ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true;
321
         }
322
      }
323
      res->definitions[0] = Definition(new_name);
324
      return res;
325
   } else {
326
      aco_ptr<Pseudo_instruction> reload{
327
         create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
328
      reload->operands[0] = Operand::c32(spill_id);
329
      reload->definitions[0] = Definition(new_name);
330
      ctx.is_reloaded[spill_id] = true;
331
      return reload;
332
   }
333
}
334

335
void
336
get_rematerialize_info(spill_ctx& ctx)
337
{
338
   for (Block& block : ctx.program->blocks) {
339
      bool logical = false;
340
      for (aco_ptr<Instruction>& instr : block.instructions) {
341
         if (instr->opcode == aco_opcode::p_logical_start)
342
            logical = true;
343
         else if (instr->opcode == aco_opcode::p_logical_end)
344
            logical = false;
345
         if (logical && should_rematerialize(instr)) {
346
            for (const Definition& def : instr->definitions) {
347
               if (def.isTemp()) {
348
                  ctx.remat[def.getTemp()] = remat_info{instr.get()};
349
                  ctx.remat_used[instr.get()] = false;
350
               }
351
            }
352
         }
353
      }
354
   }
355
}
356

357
std::vector<std::map<Temp, uint32_t>>
358
local_next_uses(spill_ctx& ctx, Block* block)
359
{
360
   std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size());
361

362
   std::map<Temp, uint32_t> next_uses;
363
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :
364
        ctx.next_use_distances_end[block->index])
365
      next_uses[pair.first] = pair.second.second + block->instructions.size();
366

367
   for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
368
      aco_ptr<Instruction>& instr = block->instructions[idx];
369
      if (!instr)
370
         break;
371
      if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi)
372
         break;
373

374
      for (const Operand& op : instr->operands) {
375
         if (op.isFixed() && op.physReg() == exec)
376
            continue;
377
         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
378
            continue;
379
         if (op.isTemp())
380
            next_uses[op.getTemp()] = idx;
381
      }
382
      for (const Definition& def : instr->definitions) {
383
         if (def.isTemp())
384
            next_uses.erase(def.getTemp());
385
      }
386
      local_next_uses[idx] = next_uses;
387
   }
388
   return local_next_uses;
389
}
390

391
RegisterDemand
392
get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx)
393
{
394
   if (idx == 0) {
395
      RegisterDemand demand = ctx.register_demand[block_idx][idx];
396
      aco_ptr<Instruction>& instr = ctx.program->blocks[block_idx].instructions[idx];
397
      aco_ptr<Instruction> instr_before(nullptr);
398
      return get_demand_before(demand, instr, instr_before);
399
   } else {
400
      return ctx.register_demand[block_idx][idx - 1];
401
   }
402
}
403

404
RegisterDemand
405
get_live_in_demand(spill_ctx& ctx, unsigned block_idx)
406
{
407
   unsigned idx = 0;
408
   RegisterDemand reg_pressure = RegisterDemand();
409
   Block& block = ctx.program->blocks[block_idx];
410
   for (aco_ptr<Instruction>& phi : block.instructions) {
411
      if (!is_phi(phi))
412
         break;
413
      idx++;
414

415
      /* Killed phi definitions increase pressure in the predecessor but not
416
       * the block they're in. Since the loops below are both to control
417
       * pressure of the start of this block and the ends of it's
418
       * predecessors, we need to count killed unspilled phi definitions here. */
419
      if (phi->definitions[0].isTemp() && phi->definitions[0].isKill() &&
420
          !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()))
421
         reg_pressure += phi->definitions[0].getTemp();
422
   }
423

424
   reg_pressure += get_demand_before(ctx, block_idx, idx);
425

426
   /* Consider register pressure from linear predecessors. This can affect
427
    * reg_pressure if the branch instructions define sgprs. */
428
   for (unsigned pred : block.linear_preds)
429
      reg_pressure.sgpr =
430
         std::max<int16_t>(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr);
431

432
   return reg_pressure;
433
}
434

435
RegisterDemand
436
init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
437
{
438
   RegisterDemand spilled_registers;
439

440
   /* first block, nothing was spilled before */
441
   if (block_idx == 0)
442
      return {0, 0};
443

444
   /* next use distances at the beginning of the current block */
445
   auto& next_use_distances = ctx.next_use_distances_start[block_idx];
446

447
   /* loop header block */
448
   if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) {
449
      assert(block->linear_preds[0] == block_idx - 1);
450
      assert(block->logical_preds[0] == block_idx - 1);
451

452
      /* create new loop_info */
453
      ctx.loop_header.emplace(block);
454

455
      /* check how many live-through variables should be spilled */
456
      RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);
457
      RegisterDemand loop_demand = reg_pressure;
458
      unsigned i = block_idx;
459
      while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {
460
         assert(ctx.program->blocks.size() > i);
461
         loop_demand.update(ctx.program->blocks[i++].register_demand);
462
      }
463
      unsigned loop_end = i;
464

465
      for (auto spilled : ctx.spills_exit[block_idx - 1]) {
466
         auto it = next_use_distances.find(spilled.first);
467

468
         /* variable is not live at loop entry: probably a phi operand */
469
         if (it == next_use_distances.end())
470
            continue;
471

472
         /* keep constants and live-through variables spilled */
473
         if (it->second.first >= loop_end || ctx.remat.count(spilled.first)) {
474
            ctx.spills_entry[block_idx][spilled.first] = spilled.second;
475
            spilled_registers += spilled.first;
476
            loop_demand -= spilled.first;
477
         }
478
      }
479

480
      /* select live-through variables and constants */
481
      RegType type = RegType::vgpr;
482
      while (loop_demand.exceeds(ctx.target_pressure)) {
483
         /* if VGPR demand is low enough, select SGPRs */
484
         if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr)
485
            type = RegType::sgpr;
486
         /* if SGPR demand is low enough, break */
487
         if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr)
488
            break;
489

490
         unsigned distance = 0;
491
         Temp to_spill;
492
         for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
493
            if (pair.first.type() == type &&
494
                (pair.second.first >= loop_end ||
495
                 (ctx.remat.count(pair.first) && type == RegType::sgpr)) &&
496
                pair.second.second > distance &&
497
                ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
498
               to_spill = pair.first;
499
               distance = pair.second.second;
500
            }
501
         }
502

503
         /* select SGPRs or break */
504
         if (distance == 0) {
505
            if (type == RegType::sgpr)
506
               break;
507
            type = RegType::sgpr;
508
            continue;
509
         }
510

511
         uint32_t spill_id;
512
         if (ctx.spills_exit[block_idx - 1].find(to_spill) ==
513
             ctx.spills_exit[block_idx - 1].end()) {
514
            spill_id = ctx.allocate_spill_id(to_spill.regClass());
515
         } else {
516
            spill_id = ctx.spills_exit[block_idx - 1][to_spill];
517
         }
518

519
         ctx.spills_entry[block_idx][to_spill] = spill_id;
520
         spilled_registers += to_spill;
521
         loop_demand -= to_spill;
522
      }
523

524
      /* shortcut */
525
      if (!loop_demand.exceeds(ctx.target_pressure))
526
         return spilled_registers;
527

528
      /* if reg pressure is too high at beginning of loop, add variables with furthest use */
529
      reg_pressure -= spilled_registers;
530

531
      while (reg_pressure.exceeds(ctx.target_pressure)) {
532
         unsigned distance = 0;
533
         Temp to_spill;
534
         type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
535

536
         for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
537
            if (pair.first.type() == type && pair.second.second > distance &&
538
                ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
539
               to_spill = pair.first;
540
               distance = pair.second.second;
541
            }
542
         }
543
         assert(distance != 0);
544
         ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
545
         spilled_registers += to_spill;
546
         reg_pressure -= to_spill;
547
      }
548

549
      return spilled_registers;
550
   }
551

552
   /* branch block */
553
   if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
554
      /* keep variables spilled if they are alive and not used in the current block */
555
      unsigned pred_idx = block->linear_preds[0];
556
      for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
557
         if (pair.first.type() == RegType::sgpr &&
558
             next_use_distances.find(pair.first) != next_use_distances.end() &&
559
             next_use_distances[pair.first].first != block_idx) {
560
            ctx.spills_entry[block_idx].insert(pair);
561
            spilled_registers.sgpr += pair.first.size();
562
         }
563
      }
564
      if (block->logical_preds.size() == 1) {
565
         pred_idx = block->logical_preds[0];
566
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
567
            if (pair.first.type() == RegType::vgpr &&
568
                next_use_distances.find(pair.first) != next_use_distances.end() &&
569
                next_use_distances[pair.first].first != block_idx) {
570
               ctx.spills_entry[block_idx].insert(pair);
571
               spilled_registers.vgpr += pair.first.size();
572
            }
573
         }
574
      }
575

576
      /* if register demand is still too high, we just keep all spilled live vars
577
       * and process the block */
578
      if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
579
         pred_idx = block->linear_preds[0];
580
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
581
            if (pair.first.type() == RegType::sgpr &&
582
                next_use_distances.find(pair.first) != next_use_distances.end() &&
583
                ctx.spills_entry[block_idx].insert(pair).second) {
584
               spilled_registers.sgpr += pair.first.size();
585
            }
586
         }
587
      }
588
      if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr &&
589
          block->logical_preds.size() == 1) {
590
         pred_idx = block->logical_preds[0];
591
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
592
            if (pair.first.type() == RegType::vgpr &&
593
                next_use_distances.find(pair.first) != next_use_distances.end() &&
594
                ctx.spills_entry[block_idx].insert(pair).second) {
595
               spilled_registers.vgpr += pair.first.size();
596
            }
597
         }
598
      }
599

600
      return spilled_registers;
601
   }
602

603
   /* else: merge block */
604
   std::set<Temp> partial_spills;
605

606
   /* keep variables spilled on all incoming paths */
607
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
608
      std::vector<unsigned>& preds =
609
         pair.first.is_linear() ? block->linear_preds : block->logical_preds;
610
      /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload
611
       * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other
612
       * predecessors. The idea is that it's better in practice to rematerialize redundantly than to
613
       * create lots of phis. */
614
      /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db
615
       * doesn't seem to exercise this path much) */
616
      bool remat = ctx.remat.count(pair.first);
617
      bool spill = !remat;
618
      uint32_t spill_id = 0;
619
      for (unsigned pred_idx : preds) {
620
         /* variable is not even live at the predecessor: probably from a phi */
621
         if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
622
             ctx.next_use_distances_end[pred_idx].end()) {
623
            spill = false;
624
            break;
625
         }
626
         if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) {
627
            if (!remat)
628
               spill = false;
629
         } else {
630
            partial_spills.insert(pair.first);
631
            /* it might be that on one incoming path, the variable has a different spill_id, but
632
             * add_couple_code() will take care of that. */
633
            spill_id = ctx.spills_exit[pred_idx][pair.first];
634
            if (remat)
635
               spill = true;
636
         }
637
      }
638
      if (spill) {
639
         ctx.spills_entry[block_idx][pair.first] = spill_id;
640
         partial_spills.erase(pair.first);
641
         spilled_registers += pair.first;
642
      }
643
   }
644

645
   /* same for phis */
646
   for (aco_ptr<Instruction>& phi : block->instructions) {
647
      if (!is_phi(phi))
648
         break;
649
      if (!phi->definitions[0].isTemp())
650
         continue;
651

652
      std::vector<unsigned>& preds =
653
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
654
      bool spill = true;
655

656
      for (unsigned i = 0; i < phi->operands.size(); i++) {
657
         /* non-temp operands can increase the register pressure */
658
         if (!phi->operands[i].isTemp()) {
659
            partial_spills.insert(phi->definitions[0].getTemp());
660
            continue;
661
         }
662

663
         if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) ==
664
             ctx.spills_exit[preds[i]].end())
665
            spill = false;
666
         else
667
            partial_spills.insert(phi->definitions[0].getTemp());
668
      }
669
      if (spill) {
670
         ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] =
671
            ctx.allocate_spill_id(phi->definitions[0].regClass());
672
         partial_spills.erase(phi->definitions[0].getTemp());
673
         spilled_registers += phi->definitions[0].getTemp();
674
      }
675
   }
676

677
   /* if reg pressure at first instruction is still too high, add partially spilled variables */
678
   RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);
679
   reg_pressure -= spilled_registers;
680

681
   while (reg_pressure.exceeds(ctx.target_pressure)) {
682
      assert(!partial_spills.empty());
683
      std::set<Temp>::iterator it = partial_spills.begin();
684
      Temp to_spill = Temp();
685
      unsigned distance = 0;
686
      RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;
687

688
      while (it != partial_spills.end()) {
689
         assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());
690

691
         if (it->type() == type && next_use_distances[*it].second > distance) {
692
            distance = next_use_distances[*it].second;
693
            to_spill = *it;
694
         }
695
         ++it;
696
      }
697
      assert(distance != 0);
698

699
      ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
700
      partial_spills.erase(to_spill);
701
      spilled_registers += to_spill;
702
      reg_pressure -= to_spill;
703
   }
704

705
   return spilled_registers;
706
}
707

708
void
709
add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
710
{
711
   /* no coupling code necessary */
712
   if (block->linear_preds.size() == 0)
713
      return;
714

715
   std::vector<aco_ptr<Instruction>> instructions;
716
   /* branch block: TODO take other branch into consideration */
717
   if (block->linear_preds.size() == 1 &&
718
       !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
719
      assert(ctx.processed[block->linear_preds[0]]);
720
      assert(ctx.register_demand[block_idx].size() == block->instructions.size());
721
      std::vector<RegisterDemand> reg_demand;
722
      unsigned insert_idx = 0;
723
      RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0);
724

725
      for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :
726
           ctx.next_use_distances_start[block_idx]) {
727
         const unsigned pred_idx = block->linear_preds[0];
728

729
         if (!live.first.is_linear())
730
            continue;
731
         /* still spilled */
732
         if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
733
            continue;
734

735
         /* in register at end of predecessor */
736
         if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
737
            std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
738
            if (it != ctx.renames[pred_idx].end())
739
               ctx.renames[block_idx].insert(*it);
740
            continue;
741
         }
742

743
         /* variable is spilled at predecessor and live at current block: create reload instruction */
744
         Temp new_name = ctx.program->allocateTmp(live.first.regClass());
745
         aco_ptr<Instruction> reload =
746
            do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
747
         instructions.emplace_back(std::move(reload));
748
         reg_demand.push_back(demand_before);
749
         ctx.renames[block_idx][live.first] = new_name;
750
      }
751

752
      if (block->logical_preds.size() == 1) {
753
         do {
754
            assert(insert_idx < block->instructions.size());
755
            instructions.emplace_back(std::move(block->instructions[insert_idx]));
756
            reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]);
757
            insert_idx++;
758
         } while (instructions.back()->opcode != aco_opcode::p_logical_start);
759

760
         unsigned pred_idx = block->logical_preds[0];
761
         for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :
762
              ctx.next_use_distances_start[block_idx]) {
763
            if (live.first.is_linear())
764
               continue;
765
            /* still spilled */
766
            if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
767
               continue;
768

769
            /* in register at end of predecessor */
770
            if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
771
               std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
772
               if (it != ctx.renames[pred_idx].end())
773
                  ctx.renames[block_idx].insert(*it);
774
               continue;
775
            }
776

777
            /* variable is spilled at predecessor and live at current block:
778
             * create reload instruction */
779
            Temp new_name = ctx.program->allocateTmp(live.first.regClass());
780
            aco_ptr<Instruction> reload =
781
               do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
782
            instructions.emplace_back(std::move(reload));
783
            reg_demand.emplace_back(reg_demand.back());
784
            ctx.renames[block_idx][live.first] = new_name;
785
         }
786
      }
787

788
      /* combine new reload instructions with original block */
789
      if (!instructions.empty()) {
790
         reg_demand.insert(reg_demand.end(),
791
                           std::next(ctx.register_demand[block->index].begin(), insert_idx),
792
                           ctx.register_demand[block->index].end());
793
         ctx.register_demand[block_idx] = std::move(reg_demand);
794
         instructions.insert(instructions.end(),
795
                             std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
796
                                std::next(block->instructions.begin(), insert_idx)),
797
                             std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
798
                                block->instructions.end()));
799
         block->instructions = std::move(instructions);
800
      }
801
      return;
802
   }
803

804
   /* loop header and merge blocks: check if all (linear) predecessors have been processed */
805
   for (ASSERTED unsigned pred : block->linear_preds)
806
      assert(ctx.processed[pred]);
807

808
   /* iterate the phi nodes for which operands to spill at the predecessor */
809
   for (aco_ptr<Instruction>& phi : block->instructions) {
810
      if (!is_phi(phi))
811
         break;
812

813
      /* if the phi is not spilled, add to instructions */
814
      if (!phi->definitions[0].isTemp() ||
815
          ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
816
             ctx.spills_entry[block_idx].end()) {
817
         instructions.emplace_back(std::move(phi));
818
         continue;
819
      }
820

821
      std::vector<unsigned>& preds =
822
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
823
      uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
824

825
      for (unsigned i = 0; i < phi->operands.size(); i++) {
826
         if (phi->operands[i].isUndefined())
827
            continue;
828

829
         unsigned pred_idx = preds[i];
830
         Operand spill_op = phi->operands[i];
831

832
         if (spill_op.isTemp()) {
833
            assert(phi->operands[i].isKill());
834
            Temp var = phi->operands[i].getTemp();
835

836
            std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
837
            /* prevent the definining instruction from being DCE'd if it could be rematerialized */
838
            if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
839
               ctx.remat_used[ctx.remat[var].instr] = true;
840

841
            /* check if variable is already spilled at predecessor */
842
            std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
843
            if (spilled != ctx.spills_exit[pred_idx].end()) {
844
               if (spilled->second != def_spill_id)
845
                  ctx.add_affinity(def_spill_id, spilled->second);
846
               continue;
847
            }
848

849
            /* rename if necessary */
850
            if (rename_it != ctx.renames[pred_idx].end()) {
851
               spill_op.setTemp(rename_it->second);
852
               ctx.renames[pred_idx].erase(rename_it);
853
            }
854
         }
855

856
         uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
857

858
         /* add interferences and affinity */
859
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
860
            ctx.add_interference(spill_id, pair.second);
861
         ctx.add_affinity(def_spill_id, spill_id);
862

863
         aco_ptr<Pseudo_instruction> spill{
864
            create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
865
         spill->operands[0] = spill_op;
866
         spill->operands[1] = Operand::c32(spill_id);
867
         Block& pred = ctx.program->blocks[pred_idx];
868
         unsigned idx = pred.instructions.size();
869
         do {
870
            assert(idx != 0);
871
            idx--;
872
         } while (phi->opcode == aco_opcode::p_phi &&
873
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
874
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
875
         pred.instructions.insert(it, std::move(spill));
876
         if (spill_op.isTemp())
877
            ctx.spills_exit[pred_idx][spill_op.getTemp()] = spill_id;
878
      }
879

880
      /* remove phi from instructions */
881
      phi.reset();
882
   }
883

884
   /* iterate all (other) spilled variables for which to spill at the predecessor */
885
   // TODO: would be better to have them sorted: first vgprs and first with longest distance
886
   for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
887
      std::vector<unsigned> preds =
888
         pair.first.is_linear() ? block->linear_preds : block->logical_preds;
889

890
      for (unsigned pred_idx : preds) {
891
         /* variable is already spilled at predecessor */
892
         std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);
893
         if (spilled != ctx.spills_exit[pred_idx].end()) {
894
            if (spilled->second != pair.second)
895
               ctx.add_affinity(pair.second, spilled->second);
896
            continue;
897
         }
898

899
         /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
900
         if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
901
             ctx.next_use_distances_end[pred_idx].end())
902
            continue;
903

904
         /* add interferences between spilled variable and predecessors exit spills */
905
         for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) {
906
            if (exit_spill.first == pair.first)
907
               continue;
908
            ctx.add_interference(exit_spill.second, pair.second);
909
         }
910

911
         /* variable is in register at predecessor and has to be spilled */
912
         /* rename if necessary */
913
         Temp var = pair.first;
914
         std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
915
         if (rename_it != ctx.renames[pred_idx].end()) {
916
            var = rename_it->second;
917
            ctx.renames[pred_idx].erase(rename_it);
918
         }
919

920
         aco_ptr<Pseudo_instruction> spill{
921
            create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
922
         spill->operands[0] = Operand(var);
923
         spill->operands[1] = Operand::c32(pair.second);
924
         Block& pred = ctx.program->blocks[pred_idx];
925
         unsigned idx = pred.instructions.size();
926
         do {
927
            assert(idx != 0);
928
            idx--;
929
         } while (pair.first.type() == RegType::vgpr &&
930
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
931
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
932
         pred.instructions.insert(it, std::move(spill));
933
         ctx.spills_exit[pred.index][pair.first] = pair.second;
934
      }
935
   }
936

937
   /* iterate phis for which operands to reload */
938
   for (aco_ptr<Instruction>& phi : instructions) {
939
      assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);
940
      assert(!phi->definitions[0].isTemp() ||
941
             ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
942
                ctx.spills_entry[block_idx].end());
943

944
      std::vector<unsigned>& preds =
945
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
946
      for (unsigned i = 0; i < phi->operands.size(); i++) {
947
         if (!phi->operands[i].isTemp())
948
            continue;
949
         unsigned pred_idx = preds[i];
950

951
         /* if the operand was reloaded, rename */
952
         if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) ==
953
             ctx.spills_exit[pred_idx].end()) {
954
            std::map<Temp, Temp>::iterator it =
955
               ctx.renames[pred_idx].find(phi->operands[i].getTemp());
956
            if (it != ctx.renames[pred_idx].end())
957
               phi->operands[i].setTemp(it->second);
958
            /* prevent the definining instruction from being DCE'd if it could be rematerialized */
959
            else if (ctx.remat.count(phi->operands[i].getTemp()))
960
               ctx.remat_used[ctx.remat[phi->operands[i].getTemp()].instr] = true;
961
            continue;
962
         }
963

964
         Temp tmp = phi->operands[i].getTemp();
965

966
         /* reload phi operand at end of predecessor block */
967
         Temp new_name = ctx.program->allocateTmp(tmp.regClass());
968
         Block& pred = ctx.program->blocks[pred_idx];
969
         unsigned idx = pred.instructions.size();
970
         do {
971
            assert(idx != 0);
972
            idx--;
973
         } while (phi->opcode == aco_opcode::p_phi &&
974
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
975
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
976
         aco_ptr<Instruction> reload =
977
            do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
978

979
         /* reload spilled exec mask directly to exec */
980
         if (!phi->definitions[0].isTemp()) {
981
            assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);
982
            reload->definitions[0] = phi->definitions[0];
983
            phi->operands[i] = Operand(exec, ctx.program->lane_mask);
984
         } else {
985
            ctx.spills_exit[pred_idx].erase(tmp);
986
            ctx.renames[pred_idx][tmp] = new_name;
987
            phi->operands[i].setTemp(new_name);
988
         }
989

990
         pred.instructions.insert(it, std::move(reload));
991
      }
992
   }
993

994
   /* iterate live variables for which to reload */
995
   // TODO: reload at current block if variable is spilled on all predecessors
996
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :
997
        ctx.next_use_distances_start[block_idx]) {
998
      /* skip spilled variables */
999
      if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
1000
         continue;
1001
      std::vector<unsigned> preds =
1002
         pair.first.is_linear() ? block->linear_preds : block->logical_preds;
1003

1004
      /* variable is dead at predecessor, it must be from a phi */
1005
      bool is_dead = false;
1006
      for (unsigned pred_idx : preds) {
1007
         if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
1008
             ctx.next_use_distances_end[pred_idx].end())
1009
            is_dead = true;
1010
      }
1011
      if (is_dead)
1012
         continue;
1013
      for (unsigned pred_idx : preds) {
1014
         /* the variable is not spilled at the predecessor */
1015
         if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())
1016
            continue;
1017

1018
         /* variable is spilled at predecessor and has to be reloaded */
1019
         Temp new_name = ctx.program->allocateTmp(pair.first.regClass());
1020
         Block& pred = ctx.program->blocks[pred_idx];
1021
         unsigned idx = pred.instructions.size();
1022
         do {
1023
            assert(idx != 0);
1024
            idx--;
1025
         } while (pair.first.type() == RegType::vgpr &&
1026
                  pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
1027
         std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
1028

1029
         aco_ptr<Instruction> reload =
1030
            do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]);
1031
         pred.instructions.insert(it, std::move(reload));
1032

1033
         ctx.spills_exit[pred.index].erase(pair.first);
1034
         ctx.renames[pred.index][pair.first] = new_name;
1035
      }
1036

1037
      /* check if we have to create a new phi for this variable */
1038
      Temp rename = Temp();
1039
      bool is_same = true;
1040
      for (unsigned pred_idx : preds) {
1041
         if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {
1042
            if (rename == Temp())
1043
               rename = pair.first;
1044
            else
1045
               is_same = rename == pair.first;
1046
         } else {
1047
            if (rename == Temp())
1048
               rename = ctx.renames[pred_idx][pair.first];
1049
            else
1050
               is_same = rename == ctx.renames[pred_idx][pair.first];
1051
         }
1052

1053
         if (!is_same)
1054
            break;
1055
      }
1056

1057
      if (!is_same) {
1058
         /* the variable was renamed differently in the predecessors: we have to create a phi */
1059
         aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1060
         aco_ptr<Pseudo_instruction> phi{
1061
            create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1062
         rename = ctx.program->allocateTmp(pair.first.regClass());
1063
         for (unsigned i = 0; i < phi->operands.size(); i++) {
1064
            Temp tmp;
1065
            if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) {
1066
               tmp = ctx.renames[preds[i]][pair.first];
1067
            } else if (preds[i] >= block_idx) {
1068
               tmp = rename;
1069
            } else {
1070
               tmp = pair.first;
1071
               /* prevent the definining instruction from being DCE'd if it could be rematerialized */
1072
               if (ctx.remat.count(tmp))
1073
                  ctx.remat_used[ctx.remat[tmp].instr] = true;
1074
            }
1075
            phi->operands[i] = Operand(tmp);
1076
         }
1077
         phi->definitions[0] = Definition(rename);
1078
         instructions.emplace_back(std::move(phi));
1079
      }
1080

1081
      /* the variable was renamed: add new name to renames */
1082
      if (!(rename == Temp() || rename == pair.first))
1083
         ctx.renames[block_idx][pair.first] = rename;
1084
   }
1085

1086
   /* combine phis with instructions */
1087
   unsigned idx = 0;
1088
   while (!block->instructions[idx]) {
1089
      idx++;
1090
   }
1091

1092
   if (!ctx.processed[block_idx]) {
1093
      assert(!(block->kind & block_kind_loop_header));
1094
      RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx);
1095
      ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(),
1096
                                              ctx.register_demand[block->index].begin() + idx);
1097
      ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(),
1098
                                               instructions.size(), demand_before);
1099
   }
1100

1101
   std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx);
1102
   instructions.insert(
1103
      instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start),
1104
      std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
1105
   block->instructions = std::move(instructions);
1106
}
1107

1108
void
1109
process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
1110
              std::map<Temp, uint32_t>& current_spills, RegisterDemand spilled_registers)
1111
{
1112
   assert(!ctx.processed[block_idx]);
1113

1114
   std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
1115
   std::vector<aco_ptr<Instruction>> instructions;
1116
   unsigned idx = 0;
1117

1118
   /* phis are handled separetely */
1119
   while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
1120
          block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
1121
      instructions.emplace_back(std::move(block->instructions[idx++]));
1122
   }
1123

1124
   if (block->register_demand.exceeds(ctx.target_pressure))
1125
      local_next_use_distance = local_next_uses(ctx, block);
1126

1127
   while (idx < block->instructions.size()) {
1128
      aco_ptr<Instruction>& instr = block->instructions[idx];
1129

1130
      std::map<Temp, std::pair<Temp, uint32_t>> reloads;
1131
      std::map<Temp, uint32_t> spills;
1132
      /* rename and reload operands */
1133
      for (Operand& op : instr->operands) {
1134
         if (!op.isTemp())
1135
            continue;
1136
         if (current_spills.find(op.getTemp()) == current_spills.end()) {
1137
            /* the Operand is in register: check if it was renamed */
1138
            if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
1139
               op.setTemp(ctx.renames[block_idx][op.getTemp()]);
1140
            /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
1141
            else if (ctx.remat.count(op.getTemp()))
1142
               ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
1143
            continue;
1144
         }
1145
         /* the Operand is spilled: add it to reloads */
1146
         Temp new_tmp = ctx.program->allocateTmp(op.regClass());
1147
         ctx.renames[block_idx][op.getTemp()] = new_tmp;
1148
         reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
1149
         current_spills.erase(op.getTemp());
1150
         op.setTemp(new_tmp);
1151
         spilled_registers -= new_tmp;
1152
      }
1153

1154
      /* check if register demand is low enough before and after the current instruction */
1155
      if (block->register_demand.exceeds(ctx.target_pressure)) {
1156

1157
         RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
1158
         new_demand.update(get_demand_before(ctx, block_idx, idx));
1159

1160
         assert(!local_next_use_distance.empty());
1161

1162
         /* if reg pressure is too high, spill variable with furthest next use */
1163
         while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
1164
            unsigned distance = 0;
1165
            Temp to_spill;
1166
            bool do_rematerialize = false;
1167
            RegType type = RegType::sgpr;
1168
            if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
1169
               type = RegType::vgpr;
1170

1171
            for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
1172
               if (pair.first.type() != type)
1173
                  continue;
1174
               bool can_rematerialize = ctx.remat.count(pair.first);
1175
               if (((pair.second > distance && can_rematerialize == do_rematerialize) ||
1176
                    (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
1177
                   current_spills.find(pair.first) == current_spills.end() &&
1178
                   ctx.spills_exit[block_idx].find(pair.first) ==
1179
                      ctx.spills_exit[block_idx].end()) {
1180
                  to_spill = pair.first;
1181
                  distance = pair.second;
1182
                  do_rematerialize = can_rematerialize;
1183
               }
1184
            }
1185

1186
            assert(distance != 0 && distance > idx);
1187
            uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());
1188

1189
            /* add interferences with currently spilled variables */
1190
            for (std::pair<Temp, uint32_t> pair : current_spills)
1191
               ctx.add_interference(spill_id, pair.second);
1192
            for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads)
1193
               ctx.add_interference(spill_id, pair.second.second);
1194

1195
            current_spills[to_spill] = spill_id;
1196
            spilled_registers += to_spill;
1197

1198
            /* rename if necessary */
1199
            if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
1200
               to_spill = ctx.renames[block_idx][to_spill];
1201
            }
1202

1203
            /* add spill to new instructions */
1204
            aco_ptr<Pseudo_instruction> spill{
1205
               create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
1206
            spill->operands[0] = Operand(to_spill);
1207
            spill->operands[1] = Operand::c32(spill_id);
1208
            instructions.emplace_back(std::move(spill));
1209
         }
1210
      }
1211

1212
      /* add reloads and instruction to new instructions */
1213
      for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
1214
         aco_ptr<Instruction> reload =
1215
            do_reload(ctx, pair.second.first, pair.first, pair.second.second);
1216
         instructions.emplace_back(std::move(reload));
1217
      }
1218
      instructions.emplace_back(std::move(instr));
1219
      idx++;
1220
   }
1221

1222
   block->instructions = std::move(instructions);
1223
   ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1224
}
1225

1226
void
1227
spill_block(spill_ctx& ctx, unsigned block_idx)
1228
{
1229
   Block* block = &ctx.program->blocks[block_idx];
1230

1231
   /* determine set of variables which are spilled at the beginning of the block */
1232
   RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);
1233

1234
   /* add interferences for spilled variables */
1235
   for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end();
1236
        ++it) {
1237
      for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2)
1238
         ctx.add_interference(it->second, it2->second);
1239
   }
1240

1241
   bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;
1242
   if (!is_loop_header) {
1243
      /* add spill/reload code on incoming control flow edges */
1244
      add_coupling_code(ctx, block, block_idx);
1245
   }
1246

1247
   std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];
1248

1249
   /* check conditions to process this block */
1250
   bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
1251
                  !ctx.renames[block_idx].empty() || ctx.remat_used.size();
1252

1253
   for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) {
1254
      if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
1255
         process = true;
1256
   }
1257

1258
   if (process)
1259
      process_block(ctx, block_idx, block, current_spills, spilled_registers);
1260
   else
1261
      ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1262

1263
   ctx.processed[block_idx] = true;
1264

1265
   /* check if the next block leaves the current loop */
1266
   if (block->loop_nest_depth == 0 ||
1267
       ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
1268
      return;
1269

1270
   Block* loop_header = ctx.loop_header.top();
1271

1272
   /* preserve original renames at end of loop header block */
1273
   std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);
1274

1275
   /* add coupling code to all loop header predecessors */
1276
   add_coupling_code(ctx, loop_header, loop_header->index);
1277

1278
   /* propagate new renames through loop: i.e. repair the SSA */
1279
   renames.swap(ctx.renames[loop_header->index]);
1280
   for (std::pair<Temp, Temp> rename : renames) {
1281
      for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {
1282
         Block& current = ctx.program->blocks[idx];
1283
         std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();
1284

1285
         /* first rename phis */
1286
         while (instr_it != current.instructions.end()) {
1287
            aco_ptr<Instruction>& phi = *instr_it;
1288
            if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
1289
               break;
1290
            /* no need to rename the loop header phis once again. this happened in
1291
             * add_coupling_code() */
1292
            if (idx == loop_header->index) {
1293
               instr_it++;
1294
               continue;
1295
            }
1296

1297
            for (Operand& op : phi->operands) {
1298
               if (!op.isTemp())
1299
                  continue;
1300
               if (op.getTemp() == rename.first)
1301
                  op.setTemp(rename.second);
1302
            }
1303
            instr_it++;
1304
         }
1305

1306
         /* variable is not live at beginning of this block */
1307
         if (ctx.next_use_distances_start[idx].count(rename.first) == 0)
1308
            continue;
1309

1310
         /* if the variable is live at the block's exit, add rename */
1311
         if (ctx.next_use_distances_end[idx].count(rename.first) != 0)
1312
            ctx.renames[idx].insert(rename);
1313

1314
         /* rename all uses in this block */
1315
         bool renamed = false;
1316
         while (!renamed && instr_it != current.instructions.end()) {
1317
            aco_ptr<Instruction>& instr = *instr_it;
1318
            for (Operand& op : instr->operands) {
1319
               if (!op.isTemp())
1320
                  continue;
1321
               if (op.getTemp() == rename.first) {
1322
                  op.setTemp(rename.second);
1323
                  /* we can stop with this block as soon as the variable is spilled */
1324
                  if (instr->opcode == aco_opcode::p_spill)
1325
                     renamed = true;
1326
               }
1327
            }
1328
            instr_it++;
1329
         }
1330
      }
1331
   }
1332

1333
   /* remove loop header info from stack */
1334
   ctx.loop_header.pop();
1335
}
1336

1337
Temp
1338
load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
1339
                      std::vector<aco_ptr<Instruction>>& instructions, unsigned offset,
1340
                      bool is_top_level)
1341
{
1342
   Builder bld(ctx.program);
1343
   if (is_top_level) {
1344
      bld.reset(&instructions);
1345
   } else {
1346
      /* find p_logical_end */
1347
      unsigned idx = instructions.size() - 1;
1348
      while (instructions[idx]->opcode != aco_opcode::p_logical_end)
1349
         idx--;
1350
      bld.reset(&instructions, std::next(instructions.begin(), idx));
1351
   }
1352

1353
   Temp private_segment_buffer = ctx.program->private_segment_buffer;
1354
   if (ctx.program->stage != compute_cs)
1355
      private_segment_buffer =
1356
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());
1357

1358
   if (offset)
1359
      scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
1360
                                scratch_offset, Operand::c32(offset));
1361

1362
   uint32_t rsrc_conf =
1363
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);
1364

1365
   if (ctx.program->chip_class >= GFX10) {
1366
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
1367
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
1368
   } else if (ctx.program->chip_class <= GFX7) {
1369
      /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
1370
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1371
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1372
   }
1373
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
1374
   if (ctx.program->chip_class <= GFX8)
1375
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
1376

1377
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
1378
                     Operand::c32(-1u), Operand::c32(rsrc_conf));
1379
}
1380

1381
void
1382
add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,
1383
                  std::vector<bool>& slots_used, unsigned id)
1384
{
1385
   for (unsigned other : ctx.interferences[id].second) {
1386
      if (!is_assigned[other])
1387
         continue;
1388

1389
      RegClass other_rc = ctx.interferences[other].first;
1390
      unsigned slot = slots[other];
1391
      std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
1392
   }
1393
}
1394

1395
unsigned
1396
find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr,
1397
                    unsigned* num_slots)
1398
{
1399
   unsigned wave_size_minus_one = wave_size - 1;
1400
   unsigned slot = 0;
1401

1402
   while (true) {
1403
      bool available = true;
1404
      for (unsigned i = 0; i < size; i++) {
1405
         if (slot + i < used.size() && used[slot + i]) {
1406
            available = false;
1407
            break;
1408
         }
1409
      }
1410
      if (!available) {
1411
         slot++;
1412
         continue;
1413
      }
1414

1415
      if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
1416
         slot = align(slot, wave_size);
1417
         continue;
1418
      }
1419

1420
      std::fill(used.begin(), used.end(), false);
1421

1422
      if (slot + size > used.size())
1423
         used.resize(slot + size);
1424

1425
      return slot;
1426
   }
1427
}
1428

1429
void
1430
assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,
1431
                          std::vector<uint32_t>& slots, unsigned* num_slots)
1432
{
1433
   std::vector<bool> slots_used(*num_slots);
1434

1435
   /* assign slots for ids with affinities first */
1436
   for (std::vector<uint32_t>& vec : ctx.affinities) {
1437
      if (ctx.interferences[vec[0]].first.type() != type)
1438
         continue;
1439

1440
      for (unsigned id : vec) {
1441
         if (!ctx.is_reloaded[id])
1442
            continue;
1443

1444
         add_interferences(ctx, is_assigned, slots, slots_used, id);
1445
      }
1446

1447
      unsigned slot =
1448
         find_available_slot(slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(),
1449
                             type == RegType::sgpr, num_slots);
1450

1451
      for (unsigned id : vec) {
1452
         assert(!is_assigned[id]);
1453

1454
         if (ctx.is_reloaded[id]) {
1455
            slots[id] = slot;
1456
            is_assigned[id] = true;
1457
         }
1458
      }
1459
   }
1460

1461
   /* assign slots for ids without affinities */
1462
   for (unsigned id = 0; id < ctx.interferences.size(); id++) {
1463
      if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
1464
         continue;
1465

1466
      add_interferences(ctx, is_assigned, slots, slots_used, id);
1467

1468
      unsigned slot =
1469
         find_available_slot(slots_used, ctx.wave_size, ctx.interferences[id].first.size(),
1470
                             type == RegType::sgpr, num_slots);
1471

1472
      slots[id] = slot;
1473
      is_assigned[id] = true;
1474
   }
1475

1476
   *num_slots = slots_used.size();
1477
}
1478

1479
void
1480
assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
1481
{
1482
   std::vector<uint32_t> slots(ctx.interferences.size());
1483
   std::vector<bool> is_assigned(ctx.interferences.size());
1484

1485
   /* first, handle affinities: just merge all interferences into both spill ids */
1486
   for (std::vector<uint32_t>& vec : ctx.affinities) {
1487
      for (unsigned i = 0; i < vec.size(); i++) {
1488
         for (unsigned j = i + 1; j < vec.size(); j++) {
1489
            assert(vec[i] != vec[j]);
1490
            bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
1491
            ctx.is_reloaded[vec[i]] = reloaded;
1492
            ctx.is_reloaded[vec[j]] = reloaded;
1493
         }
1494
      }
1495
   }
1496
   for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
1497
      for (ASSERTED uint32_t id : ctx.interferences[i].second)
1498
         assert(i != id);
1499

1500
   /* for each spill slot, assign as many spill ids as possible */
1501
   unsigned sgpr_spill_slots = 0, vgpr_spill_slots = 0;
1502
   assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &sgpr_spill_slots);
1503
   assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &vgpr_spill_slots);
1504

1505
   for (unsigned id = 0; id < is_assigned.size(); id++)
1506
      assert(is_assigned[id] || !ctx.is_reloaded[id]);
1507

1508
   for (std::vector<uint32_t>& vec : ctx.affinities) {
1509
      for (unsigned i = 0; i < vec.size(); i++) {
1510
         for (unsigned j = i + 1; j < vec.size(); j++) {
1511
            assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
1512
            if (!is_assigned[vec[i]])
1513
               continue;
1514
            assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
1515
            assert(ctx.interferences[vec[i]].first.type() ==
1516
                   ctx.interferences[vec[j]].first.type());
1517
            assert(slots[vec[i]] == slots[vec[j]]);
1518
         }
1519
      }
1520
   }
1521

1522
   /* hope, we didn't mess up */
1523
   std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
1524
   assert(vgpr_spill_temps.size() <= spills_to_vgpr);
1525

1526
   /* replace pseudo instructions with actual hardware instructions */
1527
   Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();
1528
   unsigned last_top_level_block_idx = 0;
1529
   std::vector<bool> reload_in_loop(vgpr_spill_temps.size());
1530
   for (Block& block : ctx.program->blocks) {
1531

1532
      /* after loops, we insert a user if there was a reload inside the loop */
1533
      if (block.loop_nest_depth == 0) {
1534
         int end_vgprs = 0;
1535
         for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1536
            if (reload_in_loop[i])
1537
               end_vgprs++;
1538
         }
1539

1540
         if (end_vgprs > 0) {
1541
            aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(
1542
               aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)};
1543
            int k = 0;
1544
            for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1545
               if (reload_in_loop[i])
1546
                  destr->operands[k++] = Operand(vgpr_spill_temps[i]);
1547
               reload_in_loop[i] = false;
1548
            }
1549
            /* find insertion point */
1550
            std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1551
            while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1552
               ++it;
1553
            block.instructions.insert(it, std::move(destr));
1554
         }
1555
      }
1556

1557
      if (block.kind & block_kind_top_level && !block.linear_preds.empty()) {
1558
         last_top_level_block_idx = block.index;
1559

1560
         /* check if any spilled variables use a created linear vgpr, otherwise destroy them */
1561
         for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1562
            if (vgpr_spill_temps[i] == Temp())
1563
               continue;
1564

1565
            bool can_destroy = true;
1566
            for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) {
1567

1568
               if (ctx.interferences[pair.second].first.type() == RegType::sgpr &&
1569
                   slots[pair.second] / ctx.wave_size == i) {
1570
                  can_destroy = false;
1571
                  break;
1572
               }
1573
            }
1574
            if (can_destroy)
1575
               vgpr_spill_temps[i] = Temp();
1576
         }
1577
      }
1578

1579
      std::vector<aco_ptr<Instruction>>::iterator it;
1580
      std::vector<aco_ptr<Instruction>> instructions;
1581
      instructions.reserve(block.instructions.size());
1582
      Builder bld(ctx.program, &instructions);
1583
      for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1584

1585
         if ((*it)->opcode == aco_opcode::p_spill) {
1586
            uint32_t spill_id = (*it)->operands[1].constantValue();
1587

1588
            if (!ctx.is_reloaded[spill_id]) {
1589
               /* never reloaded, so don't spill */
1590
            } else if (!is_assigned[spill_id]) {
1591
               unreachable("No spill slot assigned for spill id");
1592
            } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1593
               /* spill vgpr */
1594
               ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
1595
               uint32_t spill_slot = slots[spill_id];
1596
               bool add_offset_to_sgpr =
1597
                  ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
1598
                     vgpr_spill_slots * 4 >
1599
                  4096;
1600
               unsigned base_offset =
1601
                  add_offset_to_sgpr
1602
                     ? 0
1603
                     : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1604

1605
               /* check if the scratch resource descriptor already exists */
1606
               if (scratch_rsrc == Temp()) {
1607
                  unsigned offset =
1608
                     add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1609
                  scratch_rsrc = load_scratch_resource(
1610
                     ctx, scratch_offset,
1611
                     last_top_level_block_idx == block.index
1612
                        ? instructions
1613
                        : ctx.program->blocks[last_top_level_block_idx].instructions,
1614
                     offset, last_top_level_block_idx == block.index);
1615
               }
1616

1617
               unsigned offset = base_offset + spill_slot * 4;
1618
               aco_opcode opcode = aco_opcode::buffer_store_dword;
1619
               assert((*it)->operands[0].isTemp());
1620
               Temp temp = (*it)->operands[0].getTemp();
1621
               assert(temp.type() == RegType::vgpr && !temp.is_linear());
1622
               if (temp.size() > 1) {
1623
                  Instruction* split{create_instruction<Pseudo_instruction>(
1624
                     aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
1625
                  split->operands[0] = Operand(temp);
1626
                  for (unsigned i = 0; i < temp.size(); i++)
1627
                     split->definitions[i] = bld.def(v1);
1628
                  bld.insert(split);
1629
                  for (unsigned i = 0; i < temp.size(); i++) {
1630
                     Instruction* instr =
1631
                        bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,
1632
                                  split->definitions[i].getTemp(), offset + i * 4, false, true);
1633
                     instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1634
                  }
1635
               } else {
1636
                  Instruction* instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,
1637
                                                 temp, offset, false, true);
1638
                  instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1639
               }
1640
            } else {
1641
               ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
1642

1643
               uint32_t spill_slot = slots[spill_id];
1644

1645
               /* check if the linear vgpr already exists */
1646
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1647
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1648
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1649
                  aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
1650
                     aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1651
                  create->definitions[0] = Definition(linear_vgpr);
1652
                  /* find the right place to insert this definition */
1653
                  if (last_top_level_block_idx == block.index) {
1654
                     /* insert right before the current instruction */
1655
                     instructions.emplace_back(std::move(create));
1656
                  } else {
1657
                     assert(last_top_level_block_idx < block.index);
1658
                     /* insert before the branch at last top level block */
1659
                     std::vector<aco_ptr<Instruction>>& block_instrs =
1660
                        ctx.program->blocks[last_top_level_block_idx].instructions;
1661
                     block_instrs.insert(std::prev(block_instrs.end()), std::move(create));
1662
                  }
1663
               }
1664

1665
               /* spill sgpr: just add the vgpr temp to operands */
1666
               Pseudo_instruction* spill =
1667
                  create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
1668
               spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1669
               spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1670
               spill->operands[2] = (*it)->operands[0];
1671
               instructions.emplace_back(aco_ptr<Instruction>(spill));
1672
            }
1673

1674
         } else if ((*it)->opcode == aco_opcode::p_reload) {
1675
            uint32_t spill_id = (*it)->operands[0].constantValue();
1676
            assert(ctx.is_reloaded[spill_id]);
1677

1678
            if (!is_assigned[spill_id]) {
1679
               unreachable("No spill slot assigned for spill id");
1680
            } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1681
               /* reload vgpr */
1682
               uint32_t spill_slot = slots[spill_id];
1683
               bool add_offset_to_sgpr =
1684
                  ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
1685
                     vgpr_spill_slots * 4 >
1686
                  4096;
1687
               unsigned base_offset =
1688
                  add_offset_to_sgpr
1689
                     ? 0
1690
                     : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1691

1692
               /* check if the scratch resource descriptor already exists */
1693
               if (scratch_rsrc == Temp()) {
1694
                  unsigned offset =
1695
                     add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1696
                  scratch_rsrc = load_scratch_resource(
1697
                     ctx, scratch_offset,
1698
                     last_top_level_block_idx == block.index
1699
                        ? instructions
1700
                        : ctx.program->blocks[last_top_level_block_idx].instructions,
1701
                     offset, last_top_level_block_idx == block.index);
1702
               }
1703

1704
               unsigned offset = base_offset + spill_slot * 4;
1705
               aco_opcode opcode = aco_opcode::buffer_load_dword;
1706
               Definition def = (*it)->definitions[0];
1707
               if (def.size() > 1) {
1708
                  Instruction* vec{create_instruction<Pseudo_instruction>(
1709
                     aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
1710
                  vec->definitions[0] = def;
1711
                  for (unsigned i = 0; i < def.size(); i++) {
1712
                     Temp tmp = bld.tmp(v1);
1713
                     vec->operands[i] = Operand(tmp);
1714
                     Instruction* instr =
1715
                        bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1),
1716
                                  scratch_offset, offset + i * 4, false, true);
1717
                     instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1718
                  }
1719
                  bld.insert(vec);
1720
               } else {
1721
                  Instruction* instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1),
1722
                                                 scratch_offset, offset, false, true);
1723
                  instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1724
               }
1725
            } else {
1726
               uint32_t spill_slot = slots[spill_id];
1727
               reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0;
1728

1729
               /* check if the linear vgpr already exists */
1730
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1731
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1732
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1733
                  aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
1734
                     aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1735
                  create->definitions[0] = Definition(linear_vgpr);
1736
                  /* find the right place to insert this definition */
1737
                  if (last_top_level_block_idx == block.index) {
1738
                     /* insert right before the current instruction */
1739
                     instructions.emplace_back(std::move(create));
1740
                  } else {
1741
                     assert(last_top_level_block_idx < block.index);
1742
                     /* insert before the branch at last top level block */
1743
                     std::vector<aco_ptr<Instruction>>& block_instrs =
1744
                        ctx.program->blocks[last_top_level_block_idx].instructions;
1745
                     block_instrs.insert(std::prev(block_instrs.end()), std::move(create));
1746
                  }
1747
               }
1748

1749
               /* reload sgpr: just add the vgpr temp to operands */
1750
               Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(
1751
                  aco_opcode::p_reload, Format::PSEUDO, 2, 1);
1752
               reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1753
               reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1754
               reload->definitions[0] = (*it)->definitions[0];
1755
               instructions.emplace_back(aco_ptr<Instruction>(reload));
1756
            }
1757
         } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {
1758
            instructions.emplace_back(std::move(*it));
1759
         }
1760
      }
1761
      block.instructions = std::move(instructions);
1762
   }
1763

1764
   /* update required scratch memory */
1765
   ctx.program->config->scratch_bytes_per_wave +=
1766
      align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);
1767

1768
   /* SSA elimination inserts copies for logical phis right before p_logical_end
1769
    * So if a linear vgpr is used between that p_logical_end and the branch,
1770
    * we need to ensure logical phis don't choose a definition which aliases
1771
    * the linear vgpr.
1772
    * TODO: Moving the spills and reloads to before p_logical_end might produce
1773
    *       slightly better code. */
1774
   for (Block& block : ctx.program->blocks) {
1775
      /* loops exits are already handled */
1776
      if (block.logical_preds.size() <= 1)
1777
         continue;
1778

1779
      bool has_logical_phis = false;
1780
      for (aco_ptr<Instruction>& instr : block.instructions) {
1781
         if (instr->opcode == aco_opcode::p_phi) {
1782
            has_logical_phis = true;
1783
            break;
1784
         } else if (instr->opcode != aco_opcode::p_linear_phi) {
1785
            break;
1786
         }
1787
      }
1788
      if (!has_logical_phis)
1789
         continue;
1790

1791
      std::set<Temp> vgprs;
1792
      for (unsigned pred_idx : block.logical_preds) {
1793
         Block& pred = ctx.program->blocks[pred_idx];
1794
         for (int i = pred.instructions.size() - 1; i >= 0; i--) {
1795
            aco_ptr<Instruction>& pred_instr = pred.instructions[i];
1796
            if (pred_instr->opcode == aco_opcode::p_logical_end) {
1797
               break;
1798
            } else if (pred_instr->opcode == aco_opcode::p_spill ||
1799
                       pred_instr->opcode == aco_opcode::p_reload) {
1800
               vgprs.insert(pred_instr->operands[0].getTemp());
1801
            }
1802
         }
1803
      }
1804
      if (!vgprs.size())
1805
         continue;
1806

1807
      aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(
1808
         aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)};
1809
      int k = 0;
1810
      for (Temp tmp : vgprs) {
1811
         destr->operands[k++] = Operand(tmp);
1812
      }
1813
      /* find insertion point */
1814
      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1815
      while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1816
         ++it;
1817
      block.instructions.insert(it, std::move(destr));
1818
   }
1819
}
1820

1821
} /* end namespace */
1822

1823
void
1824
spill(Program* program, live& live_vars)
1825
{
1826
   program->config->spilled_vgprs = 0;
1827
   program->config->spilled_sgprs = 0;
1828

1829
   program->progress = CompilationProgress::after_spilling;
1830

1831
   /* no spilling when register pressure is low enough */
1832
   if (program->num_waves > 0)
1833
      return;
1834

1835
   /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
1836
   lower_to_cssa(program, live_vars);
1837

1838
   /* calculate target register demand */
1839
   const RegisterDemand demand = program->max_reg_demand; /* current max */
1840
   const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
1841
   const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
1842
   uint16_t extra_vgprs = 0;
1843
   uint16_t extra_sgprs = 0;
1844

1845
   /* calculate extra VGPRs required for spilling SGPRs */
1846
   if (demand.sgpr > sgpr_limit) {
1847
      unsigned sgpr_spills = demand.sgpr - sgpr_limit;
1848
      extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;
1849
   }
1850
   /* add extra SGPRs required for spilling VGPRs */
1851
   if (demand.vgpr + extra_vgprs > vgpr_limit) {
1852
      extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
1853
      if (demand.sgpr + extra_sgprs > sgpr_limit) {
1854
         /* re-calculate in case something has changed */
1855
         unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
1856
         extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;
1857
      }
1858
   }
1859
   /* the spiller has to target the following register demand */
1860
   const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);
1861

1862
   /* initialize ctx */
1863
   spill_ctx ctx(target, program, live_vars.register_demand);
1864
   compute_global_next_uses(ctx);
1865
   get_rematerialize_info(ctx);
1866

1867
   /* create spills and reloads */
1868
   for (unsigned i = 0; i < program->blocks.size(); i++)
1869
      spill_block(ctx, i);
1870

1871
   /* assign spill slots and DCE rematerialized code */
1872
   assign_spill_slots(ctx, extra_vgprs);
1873

1874
   /* update live variable information */
1875
   live_vars = live_var_analysis(program);
1876

1877
   assert(program->num_waves > 0);
1878
}
1879

1880
} // namespace aco
1881

1882
Product

Resources

Company