Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_spill.cpp
4550 views
1
/*
2
* Copyright © 2018 Valve Corporation
3
* Copyright © 2018 Google
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
* IN THE SOFTWARE.
23
*
24
*/
25
26
#include "aco_builder.h"
27
#include "aco_ir.h"
28
29
#include "common/sid.h"
30
31
#include <map>
32
#include <set>
33
#include <stack>
34
#include <unordered_set>
35
#include <vector>
36
37
/*
38
* Implements the spilling algorithm on SSA-form from
39
* "Register Spilling and Live-Range Splitting for SSA-Form Programs"
40
* by Matthias Braun and Sebastian Hack.
41
*/
42
43
namespace aco {
44
45
namespace {
46
47
/* Describes a variable that can be rematerialized (recomputed at its use)
 * instead of being spilled to memory. Only the defining instruction is needed,
 * since should_rematerialize() guarantees it has constant-only operands and a
 * single definition. */
struct remat_info {
   Instruction* instr;
};
50
51
/* All state of the spilling pass: per-block spill/reload maps, next-use
 * distances, interference and affinity information between spill ids, and
 * rematerialization candidates. */
struct spill_ctx {
   RegisterDemand target_pressure; /* register demand the pass must get below */
   Program* program;
   /* per block, per instruction: register demand (taken over from live-var analysis) */
   std::vector<std::vector<RegisterDemand>> register_demand;
   /* per block: original temp -> rename introduced by a reload */
   std::vector<std::map<Temp, Temp>> renames;
   /* per block: temps spilled at block entry -> spill id */
   std::vector<std::map<Temp, uint32_t>> spills_entry;
   /* per block: temps spilled at block exit -> spill id */
   std::vector<std::map<Temp, uint32_t>> spills_exit;
   std::vector<bool> processed;
   std::stack<Block*> loop_header;
   /* per block: temp -> (block index of next use, distance in instructions) */
   std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start;
   std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end;
   /* per spill id: its register class and the spill ids it must not share a slot with */
   std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;
   /* groups of spill ids that should be assigned the same spill slot */
   std::vector<std::vector<uint32_t>> affinities;
   /* per spill id: whether any p_reload was emitted for it */
   std::vector<bool> is_reloaded;
   /* temps that can be rematerialized -> their defining instruction */
   std::map<Temp, remat_info> remat;
   /* defining instructions of remat candidates -> whether they are still used */
   std::map<Instruction*, bool> remat_used;
   unsigned wave_size;

   spill_ctx(const RegisterDemand target_pressure_, Program* program_,
             std::vector<std::vector<RegisterDemand>> register_demand_)
       : target_pressure(target_pressure_), program(program_),
         register_demand(std::move(register_demand_)), renames(program->blocks.size()),
         spills_entry(program->blocks.size()), spills_exit(program->blocks.size()),
         processed(program->blocks.size(), false), wave_size(program->wave_size)
   {}

   /* Record that 'first' and 'second' should end up in the same spill slot,
    * merging existing affinity groups when both ids already belong to one. */
   void add_affinity(uint32_t first, uint32_t second)
   {
      /* affinities.size() doubles as a "not found" sentinel */
      unsigned found_first = affinities.size();
      unsigned found_second = affinities.size();
      for (unsigned i = 0; i < affinities.size(); i++) {
         std::vector<uint32_t>& vec = affinities[i];
         for (uint32_t entry : vec) {
            if (entry == first)
               found_first = i;
            else if (entry == second)
               found_second = i;
         }
      }
      if (found_first == affinities.size() && found_second == affinities.size()) {
         /* neither id is in a group yet: start a new one */
         affinities.emplace_back(std::vector<uint32_t>({first, second}));
      } else if (found_first < affinities.size() && found_second == affinities.size()) {
         affinities[found_first].push_back(second);
      } else if (found_second < affinities.size() && found_first == affinities.size()) {
         affinities[found_second].push_back(first);
      } else if (found_first != found_second) {
         /* merge second into first */
         affinities[found_first].insert(affinities[found_first].end(),
                                        affinities[found_second].begin(),
                                        affinities[found_second].end());
         affinities.erase(std::next(affinities.begin(), found_second));
      } else {
         /* already in the same group: nothing to do */
         assert(found_first == found_second);
      }
   }

   /* Record that two spill ids may be live at the same time and thus must not
    * share a slot. Ids of different register types use disjoint slot spaces,
    * so such pairs are not recorded. The relation is kept symmetric. */
   void add_interference(uint32_t first, uint32_t second)
   {
      if (interferences[first].first.type() != interferences[second].first.type())
         return;

      bool inserted = interferences[first].second.insert(second).second;
      if (inserted)
         interferences[second].second.insert(first);
   }

   /* Create a fresh spill id for a variable of register class 'rc'. */
   uint32_t allocate_spill_id(RegClass rc)
   {
      interferences.emplace_back(rc, std::unordered_set<uint32_t>());
      is_reloaded.push_back(false);
      return next_spill_id++;
   }

   uint32_t next_spill_id = 0;
};
126
127
int32_t
128
get_dominator(int idx_a, int idx_b, Program* program, bool is_linear)
129
{
130
131
if (idx_a == -1)
132
return idx_b;
133
if (idx_b == -1)
134
return idx_a;
135
if (is_linear) {
136
while (idx_a != idx_b) {
137
if (idx_a > idx_b)
138
idx_a = program->blocks[idx_a].linear_idom;
139
else
140
idx_b = program->blocks[idx_b].linear_idom;
141
}
142
} else {
143
while (idx_a != idx_b) {
144
if (idx_a > idx_b)
145
idx_a = program->blocks[idx_a].logical_idom;
146
else
147
idx_b = program->blocks[idx_b].logical_idom;
148
}
149
}
150
assert(idx_a != -1);
151
return idx_a;
152
}
153
154
/* One backward-dataflow step of the global next-use computation: recomputes
 * the next-use distances at the start of 'block_idx' from the distances at its
 * end, then pushes the block-start information (and phi-operand uses) to the
 * predecessors' end-distances. Predecessors whose information changed are put
 * back on the worklist so compute_global_next_uses() reaches a fixed point.
 * A distance is the pair (block index of the next use, #instructions away). */
void
next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist)
{
   Block* block = &ctx.program->blocks[block_idx];
   std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx];

   /* to compute the next use distance at the beginning of the block, we have to add the block's
    * size */
   for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin();
        it != next_uses.end(); ++it)
      it->second.second = it->second.second + block->instructions.size();

   /* walk the non-phi instructions backwards, killing definitions and
    * recording operand uses */
   int idx = block->instructions.size() - 1;
   while (idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[idx];

      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
         break;

      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            next_uses.erase(def.getTemp());
      }

      for (const Operand& op : instr->operands) {
         /* omit exec mask */
         if (op.isFixed() && op.physReg() == exec)
            continue;
         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
            continue;
         if (op.isTemp())
            next_uses[op.getTemp()] = {block_idx, idx};
      }
      idx--;
   }

   assert(block_idx != 0 || next_uses.empty());
   ctx.next_use_distances_start[block_idx] = next_uses;

   /* the remaining instructions (idx downwards) are all phis: their operands
    * count as uses at the end of the corresponding predecessor, not here */
   while (idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[idx];
      assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi);

      if (!instr->definitions[0].isTemp()) {
         idx--;
         continue;
      }

      /* if the phi result itself is unused, treat the operand use as distance 0 */
      auto it = next_uses.find(instr->definitions[0].getTemp());
      std::pair<uint32_t, uint32_t> distance =
         it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         unsigned pred_idx =
            instr->opcode == aco_opcode::p_phi ? block->logical_preds[i] : block->linear_preds[i];
         if (instr->operands[i].isTemp()) {
            /* note: operator[] default-inserts (0, 0) for a missing temp, which
             * is fine because the value is overwritten right below */
            if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) ==
                   ctx.next_use_distances_end[pred_idx].end() ||
                ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance)
               worklist.insert(pred_idx);
            ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance;
         }
      }
      next_uses.erase(instr->definitions[0].getTemp());
      idx--;
   }

   /* all remaining live vars must be live-out at the predecessors */
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) {
      Temp temp = pair.first;
      uint32_t distance = pair.second.second;
      uint32_t dom = pair.second.first;
      std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds;
      for (unsigned pred_idx : preds) {
         /* add a large penalty when entering the block means entering a deeper
          * loop, so such variables look like good spill candidates */
         if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth)
            distance += 0xFFFF;
         if (ctx.next_use_distances_end[pred_idx].find(temp) !=
             ctx.next_use_distances_end[pred_idx].end()) {
            /* combine with the use already known at the predecessor's end */
            dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program,
                                temp.is_linear());
            distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance);
         }
         /* note: operator[] default-inserts here if 'temp' was absent, making
          * the comparison fail and queueing the predecessor — intended, since
          * new information must be propagated */
         if (ctx.next_use_distances_end[pred_idx][temp] !=
             std::pair<uint32_t, uint32_t>{dom, distance})
            worklist.insert(pred_idx);
         ctx.next_use_distances_end[pred_idx][temp] = {dom, distance};
      }
   }
}
241
242
void
243
compute_global_next_uses(spill_ctx& ctx)
244
{
245
ctx.next_use_distances_start.resize(ctx.program->blocks.size());
246
ctx.next_use_distances_end.resize(ctx.program->blocks.size());
247
std::set<uint32_t> worklist;
248
for (Block& block : ctx.program->blocks)
249
worklist.insert(block.index);
250
251
while (!worklist.empty()) {
252
std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
253
unsigned block_idx = *b_it;
254
worklist.erase(block_idx);
255
next_uses_per_block(ctx, block_idx, worklist);
256
}
257
}
258
259
bool
260
should_rematerialize(aco_ptr<Instruction>& instr)
261
{
262
/* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
263
if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&
264
instr->format != Format::PSEUDO && instr->format != Format::SOPK)
265
return false;
266
/* TODO: pseudo-instruction rematerialization is only supported for
267
* p_create_vector/p_parallelcopy */
268
if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&
269
instr->opcode != aco_opcode::p_parallelcopy)
270
return false;
271
if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)
272
return false;
273
274
for (const Operand& op : instr->operands) {
275
/* TODO: rematerialization using temporaries isn't yet supported */
276
if (!op.isConstant())
277
return false;
278
}
279
280
/* TODO: rematerialization with multiple definitions isn't yet supported */
281
if (instr->definitions.size() > 1)
282
return false;
283
284
return true;
285
}
286
287
/* Creates the instruction that materializes spill id 'spill_id' of variable
 * 'tmp' into the fresh temporary 'new_name'. If 'tmp' is a rematerialization
 * candidate, a copy of its (constant-operand) defining instruction is emitted;
 * otherwise a p_reload pseudo-instruction is created and the spill id is
 * flagged as actually reloaded. */
aco_ptr<Instruction>
do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
{
   std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
   if (remat != ctx.remat.end()) {
      Instruction* instr = remat->second.instr;
      /* these invariants are established by should_rematerialize() */
      assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&
             "unsupported");
      assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||
              instr->opcode == aco_opcode::p_parallelcopy) &&
             "unsupported");
      assert(instr->definitions.size() == 1 && "unsupported");

      /* clone the defining instruction with the correct concrete type */
      aco_ptr<Instruction> res;
      if (instr->isVOP1()) {
         res.reset(create_instruction<VOP1_instruction>(
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
      } else if (instr->isSOP1()) {
         res.reset(create_instruction<SOP1_instruction>(
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
      } else if (instr->isPseudo()) {
         res.reset(create_instruction<Pseudo_instruction>(
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
      } else if (instr->isSOPK()) {
         res.reset(create_instruction<SOPK_instruction>(
            instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
         res->sopk().imm = instr->sopk().imm;
      }
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         res->operands[i] = instr->operands[i];
         if (instr->operands[i].isTemp()) {
            /* temp operands are rejected by should_rematerialize(); this branch
             * is dead code kept for a future extension */
            assert(false && "unsupported");
            if (ctx.remat.count(instr->operands[i].getTemp()))
               ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true;
         }
      }
      res->definitions[0] = Definition(new_name);
      return res;
   } else {
      /* not rematerializable: emit a real reload */
      aco_ptr<Pseudo_instruction> reload{
         create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
      reload->operands[0] = Operand::c32(spill_id);
      reload->definitions[0] = Definition(new_name);
      ctx.is_reloaded[spill_id] = true;
      return reload;
   }
}
334
335
void
336
get_rematerialize_info(spill_ctx& ctx)
337
{
338
for (Block& block : ctx.program->blocks) {
339
bool logical = false;
340
for (aco_ptr<Instruction>& instr : block.instructions) {
341
if (instr->opcode == aco_opcode::p_logical_start)
342
logical = true;
343
else if (instr->opcode == aco_opcode::p_logical_end)
344
logical = false;
345
if (logical && should_rematerialize(instr)) {
346
for (const Definition& def : instr->definitions) {
347
if (def.isTemp()) {
348
ctx.remat[def.getTemp()] = remat_info{instr.get()};
349
ctx.remat_used[instr.get()] = false;
350
}
351
}
352
}
353
}
354
}
355
}
356
357
/* Computes, for every instruction index of 'block', a snapshot of the live
 * temps and the instruction index of their next use. Uses past the end of the
 * block are given indices beyond instructions.size() (derived from the global
 * end-of-block distances), so larger values always mean "used later". */
std::vector<std::map<Temp, uint32_t>>
local_next_uses(spill_ctx& ctx, Block* block)
{
   std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size());

   /* seed with variables live at the end of the block */
   std::map<Temp, uint32_t> next_uses;
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :
        ctx.next_use_distances_end[block->index])
      next_uses[pair.first] = pair.second.second + block->instructions.size();

   /* walk backwards; each non-phi instruction gets a full copy of the map */
   for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
      aco_ptr<Instruction>& instr = block->instructions[idx];
      if (!instr)
         break;
      if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi)
         break;

      for (const Operand& op : instr->operands) {
         /* omit exec mask */
         if (op.isFixed() && op.physReg() == exec)
            continue;
         if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
            continue;
         if (op.isTemp())
            next_uses[op.getTemp()] = idx;
      }
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            next_uses.erase(def.getTemp());
      }
      local_next_uses[idx] = next_uses;
   }
   return local_next_uses;
}
390
391
/* Returns the register demand immediately before instruction 'idx' of block
 * 'block_idx'. For any instruction but the first this is simply the demand
 * recorded after the previous instruction; for the first one it is derived
 * from the instruction's own demand via the aco_ir.h helper. */
RegisterDemand
get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx)
{
   if (idx > 0)
      return ctx.register_demand[block_idx][idx - 1];

   RegisterDemand demand = ctx.register_demand[block_idx][idx];
   aco_ptr<Instruction>& instr = ctx.program->blocks[block_idx].instructions[idx];
   aco_ptr<Instruction> instr_before(nullptr);
   return get_demand_before(demand, instr, instr_before);
}
403
404
/* Computes the register demand of the variables live at the start of block
 * 'block_idx', i.e. right after its phis. */
RegisterDemand
get_live_in_demand(spill_ctx& ctx, unsigned block_idx)
{
   Block& block = ctx.program->blocks[block_idx];
   RegisterDemand reg_pressure{};

   /* count leading phis and their killed, unspilled definitions */
   unsigned num_phis = 0;
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;
      num_phis++;

      /* Killed phi definitions increase pressure in the predecessor but not
       * the block they're in. Since the loops below are both to control
       * pressure of the start of this block and the ends of it's
       * predecessors, we need to count killed unspilled phi definitions here. */
      const Definition& def = phi->definitions[0];
      if (def.isTemp() && def.isKill() && !ctx.spills_entry[block_idx].count(def.getTemp()))
         reg_pressure += def.getTemp();
   }

   reg_pressure += get_demand_before(ctx, block_idx, num_phis);

   /* Consider register pressure from linear predecessors. This can affect
    * reg_pressure if the branch instructions define sgprs. */
   for (unsigned pred : block.linear_preds) {
      const int16_t pred_exit_sgpr = ctx.register_demand[pred].back().sgpr;
      reg_pressure.sgpr = std::max<int16_t>(reg_pressure.sgpr, pred_exit_sgpr);
   }

   return reg_pressure;
}
434
435
/* Decides which variables are spilled at the entry of 'block' and fills
 * ctx.spills_entry[block_idx] accordingly, returning the register demand
 * freed by those spills. Three cases are distinguished:
 *  - loop headers: prefer spilling variables that live through the whole loop
 *    (or are rematerializable), so no spill/reload code lands inside the loop;
 *  - single-linear-predecessor ("branch") blocks: inherit the predecessor's
 *    spills for variables not used in this block;
 *  - merge blocks: keep variables spilled on all incoming paths spilled, and
 *    track partially-spilled ones as further candidates. */
RegisterDemand
init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
{
   RegisterDemand spilled_registers;

   /* first block, nothing was spilled before */
   if (block_idx == 0)
      return {0, 0};

   /* next use distances at the beginning of the current block */
   auto& next_use_distances = ctx.next_use_distances_start[block_idx];

   /* loop header block */
   if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) {
      assert(block->linear_preds[0] == block_idx - 1);
      assert(block->logical_preds[0] == block_idx - 1);

      /* create new loop_info */
      ctx.loop_header.emplace(block);

      /* check how many live-through variables should be spilled */
      RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);
      RegisterDemand loop_demand = reg_pressure;
      /* find the maximum demand over all blocks of the loop; loop_end becomes
       * the index of the first block after the loop */
      unsigned i = block_idx;
      while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {
         assert(ctx.program->blocks.size() > i);
         loop_demand.update(ctx.program->blocks[i++].register_demand);
      }
      unsigned loop_end = i;

      /* first, carry over spills from before the loop where that is free */
      for (auto spilled : ctx.spills_exit[block_idx - 1]) {
         auto it = next_use_distances.find(spilled.first);

         /* variable is not live at loop entry: probably a phi operand */
         if (it == next_use_distances.end())
            continue;

         /* keep constants and live-through variables spilled */
         if (it->second.first >= loop_end || ctx.remat.count(spilled.first)) {
            ctx.spills_entry[block_idx][spilled.first] = spilled.second;
            spilled_registers += spilled.first;
            loop_demand -= spilled.first;
         }
      }

      /* select live-through variables and constants */
      RegType type = RegType::vgpr;
      while (loop_demand.exceeds(ctx.target_pressure)) {
         /* if VGPR demand is low enough, select SGPRs */
         if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr)
            type = RegType::sgpr;
         /* if SGPR demand is low enough, break */
         if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr)
            break;

         /* pick the not-yet-spilled live-through (or rematerializable sgpr)
          * variable with the furthest next use */
         unsigned distance = 0;
         Temp to_spill;
         for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
            if (pair.first.type() == type &&
                (pair.second.first >= loop_end ||
                 (ctx.remat.count(pair.first) && type == RegType::sgpr)) &&
                pair.second.second > distance &&
                ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
               to_spill = pair.first;
               distance = pair.second.second;
            }
         }

         /* select SGPRs or break */
         if (distance == 0) {
            if (type == RegType::sgpr)
               break;
            type = RegType::sgpr;
            continue;
         }

         /* reuse the spill id if the variable was already spilled before the loop */
         uint32_t spill_id;
         if (ctx.spills_exit[block_idx - 1].find(to_spill) ==
             ctx.spills_exit[block_idx - 1].end()) {
            spill_id = ctx.allocate_spill_id(to_spill.regClass());
         } else {
            spill_id = ctx.spills_exit[block_idx - 1][to_spill];
         }

         ctx.spills_entry[block_idx][to_spill] = spill_id;
         spilled_registers += to_spill;
         loop_demand -= to_spill;
      }

      /* shortcut */
      if (!loop_demand.exceeds(ctx.target_pressure))
         return spilled_registers;

      /* if reg pressure is too high at beginning of loop, add variables with furthest use */
      reg_pressure -= spilled_registers;

      while (reg_pressure.exceeds(ctx.target_pressure)) {
         unsigned distance = 0;
         Temp to_spill;
         type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;

         for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
            if (pair.first.type() == type && pair.second.second > distance &&
                ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
               to_spill = pair.first;
               distance = pair.second.second;
            }
         }
         assert(distance != 0);
         ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
         spilled_registers += to_spill;
         reg_pressure -= to_spill;
      }

      return spilled_registers;
   }

   /* branch block */
   if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
      /* keep variables spilled if they are alive and not used in the current block */
      unsigned pred_idx = block->linear_preds[0];
      for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
         if (pair.first.type() == RegType::sgpr &&
             next_use_distances.find(pair.first) != next_use_distances.end() &&
             next_use_distances[pair.first].first != block_idx) {
            ctx.spills_entry[block_idx].insert(pair);
            spilled_registers.sgpr += pair.first.size();
         }
      }
      /* vgprs can only be inherited if the logical predecessor matches too */
      if (block->logical_preds.size() == 1) {
         pred_idx = block->logical_preds[0];
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
            if (pair.first.type() == RegType::vgpr &&
                next_use_distances.find(pair.first) != next_use_distances.end() &&
                next_use_distances[pair.first].first != block_idx) {
               ctx.spills_entry[block_idx].insert(pair);
               spilled_registers.vgpr += pair.first.size();
            }
         }
      }

      /* if register demand is still too high, we just keep all spilled live vars
       * and process the block */
      if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
         pred_idx = block->linear_preds[0];
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
            if (pair.first.type() == RegType::sgpr &&
                next_use_distances.find(pair.first) != next_use_distances.end() &&
                ctx.spills_entry[block_idx].insert(pair).second) {
               spilled_registers.sgpr += pair.first.size();
            }
         }
      }
      if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr &&
          block->logical_preds.size() == 1) {
         pred_idx = block->logical_preds[0];
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
            if (pair.first.type() == RegType::vgpr &&
                next_use_distances.find(pair.first) != next_use_distances.end() &&
                ctx.spills_entry[block_idx].insert(pair).second) {
               spilled_registers.vgpr += pair.first.size();
            }
         }
      }

      return spilled_registers;
   }

   /* else: merge block */
   std::set<Temp> partial_spills;

   /* keep variables spilled on all incoming paths */
   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {
      std::vector<unsigned>& preds =
         pair.first.is_linear() ? block->linear_preds : block->logical_preds;
      /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload
       * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other
       * predecessors. The idea is that it's better in practice to rematerialize redundantly than to
       * create lots of phis. */
      /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db
       * doesn't seem to exercise this path much) */
      bool remat = ctx.remat.count(pair.first);
      bool spill = !remat;
      uint32_t spill_id = 0;
      for (unsigned pred_idx : preds) {
         /* variable is not even live at the predecessor: probably from a phi */
         if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
             ctx.next_use_distances_end[pred_idx].end()) {
            spill = false;
            break;
         }
         if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) {
            if (!remat)
               spill = false;
         } else {
            partial_spills.insert(pair.first);
            /* it might be that on one incoming path, the variable has a different spill_id, but
             * add_couple_code() will take care of that. */
            spill_id = ctx.spills_exit[pred_idx][pair.first];
            if (remat)
               spill = true;
         }
      }
      if (spill) {
         ctx.spills_entry[block_idx][pair.first] = spill_id;
         partial_spills.erase(pair.first);
         spilled_registers += pair.first;
      }
   }

   /* same for phis */
   for (aco_ptr<Instruction>& phi : block->instructions) {
      if (!is_phi(phi))
         break;
      if (!phi->definitions[0].isTemp())
         continue;

      std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      bool spill = true;

      for (unsigned i = 0; i < phi->operands.size(); i++) {
         /* non-temp operands can increase the register pressure */
         if (!phi->operands[i].isTemp()) {
            partial_spills.insert(phi->definitions[0].getTemp());
            continue;
         }

         if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) ==
             ctx.spills_exit[preds[i]].end())
            spill = false;
         else
            partial_spills.insert(phi->definitions[0].getTemp());
      }
      if (spill) {
         /* all operands are spilled: spill the phi result as well */
         ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] =
            ctx.allocate_spill_id(phi->definitions[0].regClass());
         partial_spills.erase(phi->definitions[0].getTemp());
         spilled_registers += phi->definitions[0].getTemp();
      }
   }

   /* if reg pressure at first instruction is still too high, add partially spilled variables */
   RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);
   reg_pressure -= spilled_registers;

   while (reg_pressure.exceeds(ctx.target_pressure)) {
      assert(!partial_spills.empty());
      std::set<Temp>::iterator it = partial_spills.begin();
      Temp to_spill = Temp();
      unsigned distance = 0;
      RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;

      /* among the partial spills of the needed type, pick the furthest next use */
      while (it != partial_spills.end()) {
         assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());

         if (it->type() == type && next_use_distances[*it].second > distance) {
            distance = next_use_distances[*it].second;
            to_spill = *it;
         }
         ++it;
      }
      assert(distance != 0);

      ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
      partial_spills.erase(to_spill);
      spilled_registers += to_spill;
      reg_pressure -= to_spill;
   }

   return spilled_registers;
}
707
708
void
709
add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
710
{
711
/* no coupling code necessary */
712
if (block->linear_preds.size() == 0)
713
return;
714
715
std::vector<aco_ptr<Instruction>> instructions;
716
/* branch block: TODO take other branch into consideration */
717
if (block->linear_preds.size() == 1 &&
718
!(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {
719
assert(ctx.processed[block->linear_preds[0]]);
720
assert(ctx.register_demand[block_idx].size() == block->instructions.size());
721
std::vector<RegisterDemand> reg_demand;
722
unsigned insert_idx = 0;
723
RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0);
724
725
for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :
726
ctx.next_use_distances_start[block_idx]) {
727
const unsigned pred_idx = block->linear_preds[0];
728
729
if (!live.first.is_linear())
730
continue;
731
/* still spilled */
732
if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
733
continue;
734
735
/* in register at end of predecessor */
736
if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
737
std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
738
if (it != ctx.renames[pred_idx].end())
739
ctx.renames[block_idx].insert(*it);
740
continue;
741
}
742
743
/* variable is spilled at predecessor and live at current block: create reload instruction */
744
Temp new_name = ctx.program->allocateTmp(live.first.regClass());
745
aco_ptr<Instruction> reload =
746
do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
747
instructions.emplace_back(std::move(reload));
748
reg_demand.push_back(demand_before);
749
ctx.renames[block_idx][live.first] = new_name;
750
}
751
752
if (block->logical_preds.size() == 1) {
753
do {
754
assert(insert_idx < block->instructions.size());
755
instructions.emplace_back(std::move(block->instructions[insert_idx]));
756
reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]);
757
insert_idx++;
758
} while (instructions.back()->opcode != aco_opcode::p_logical_start);
759
760
unsigned pred_idx = block->logical_preds[0];
761
for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :
762
ctx.next_use_distances_start[block_idx]) {
763
if (live.first.is_linear())
764
continue;
765
/* still spilled */
766
if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
767
continue;
768
769
/* in register at end of predecessor */
770
if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
771
std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
772
if (it != ctx.renames[pred_idx].end())
773
ctx.renames[block_idx].insert(*it);
774
continue;
775
}
776
777
/* variable is spilled at predecessor and live at current block:
778
* create reload instruction */
779
Temp new_name = ctx.program->allocateTmp(live.first.regClass());
780
aco_ptr<Instruction> reload =
781
do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
782
instructions.emplace_back(std::move(reload));
783
reg_demand.emplace_back(reg_demand.back());
784
ctx.renames[block_idx][live.first] = new_name;
785
}
786
}
787
788
/* combine new reload instructions with original block */
789
if (!instructions.empty()) {
790
reg_demand.insert(reg_demand.end(),
791
std::next(ctx.register_demand[block->index].begin(), insert_idx),
792
ctx.register_demand[block->index].end());
793
ctx.register_demand[block_idx] = std::move(reg_demand);
794
instructions.insert(instructions.end(),
795
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
796
std::next(block->instructions.begin(), insert_idx)),
797
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
798
block->instructions.end()));
799
block->instructions = std::move(instructions);
800
}
801
return;
802
}
803
804
/* loop header and merge blocks: check if all (linear) predecessors have been processed */
805
for (ASSERTED unsigned pred : block->linear_preds)
806
assert(ctx.processed[pred]);
807
808
/* iterate the phi nodes for which operands to spill at the predecessor */
809
for (aco_ptr<Instruction>& phi : block->instructions) {
810
if (!is_phi(phi))
811
break;
812
813
/* if the phi is not spilled, add to instructions */
814
if (!phi->definitions[0].isTemp() ||
815
ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
816
ctx.spills_entry[block_idx].end()) {
817
instructions.emplace_back(std::move(phi));
818
continue;
819
}
820
821
std::vector<unsigned>& preds =
822
phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
823
uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
824
825
for (unsigned i = 0; i < phi->operands.size(); i++) {
826
if (phi->operands[i].isUndefined())
827
continue;
828
829
unsigned pred_idx = preds[i];
830
Operand spill_op = phi->operands[i];
831
832
if (spill_op.isTemp()) {
833
assert(phi->operands[i].isKill());
834
Temp var = phi->operands[i].getTemp();
835
836
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
837
/* prevent the definining instruction from being DCE'd if it could be rematerialized */
838
if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
839
ctx.remat_used[ctx.remat[var].instr] = true;
840
841
/* check if variable is already spilled at predecessor */
842
std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
843
if (spilled != ctx.spills_exit[pred_idx].end()) {
844
if (spilled->second != def_spill_id)
845
ctx.add_affinity(def_spill_id, spilled->second);
846
continue;
847
}
848
849
/* rename if necessary */
850
if (rename_it != ctx.renames[pred_idx].end()) {
851
spill_op.setTemp(rename_it->second);
852
ctx.renames[pred_idx].erase(rename_it);
853
}
854
}
855
856
uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
857
858
/* add interferences and affinity */
859
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
860
ctx.add_interference(spill_id, pair.second);
861
ctx.add_affinity(def_spill_id, spill_id);
862
863
aco_ptr<Pseudo_instruction> spill{
864
create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
865
spill->operands[0] = spill_op;
866
spill->operands[1] = Operand::c32(spill_id);
867
Block& pred = ctx.program->blocks[pred_idx];
868
unsigned idx = pred.instructions.size();
869
do {
870
assert(idx != 0);
871
idx--;
872
} while (phi->opcode == aco_opcode::p_phi &&
873
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
874
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
875
pred.instructions.insert(it, std::move(spill));
876
if (spill_op.isTemp())
877
ctx.spills_exit[pred_idx][spill_op.getTemp()] = spill_id;
878
}
879
880
/* remove phi from instructions */
881
phi.reset();
882
}
883
884
/* iterate all (other) spilled variables for which to spill at the predecessor */
885
// TODO: would be better to have them sorted: first vgprs and first with longest distance
886
for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
887
std::vector<unsigned> preds =
888
pair.first.is_linear() ? block->linear_preds : block->logical_preds;
889
890
for (unsigned pred_idx : preds) {
891
/* variable is already spilled at predecessor */
892
std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);
893
if (spilled != ctx.spills_exit[pred_idx].end()) {
894
if (spilled->second != pair.second)
895
ctx.add_affinity(pair.second, spilled->second);
896
continue;
897
}
898
899
/* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
900
if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
901
ctx.next_use_distances_end[pred_idx].end())
902
continue;
903
904
/* add interferences between spilled variable and predecessors exit spills */
905
for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) {
906
if (exit_spill.first == pair.first)
907
continue;
908
ctx.add_interference(exit_spill.second, pair.second);
909
}
910
911
/* variable is in register at predecessor and has to be spilled */
912
/* rename if necessary */
913
Temp var = pair.first;
914
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
915
if (rename_it != ctx.renames[pred_idx].end()) {
916
var = rename_it->second;
917
ctx.renames[pred_idx].erase(rename_it);
918
}
919
920
aco_ptr<Pseudo_instruction> spill{
921
create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
922
spill->operands[0] = Operand(var);
923
spill->operands[1] = Operand::c32(pair.second);
924
Block& pred = ctx.program->blocks[pred_idx];
925
unsigned idx = pred.instructions.size();
926
do {
927
assert(idx != 0);
928
idx--;
929
} while (pair.first.type() == RegType::vgpr &&
930
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
931
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
932
pred.instructions.insert(it, std::move(spill));
933
ctx.spills_exit[pred.index][pair.first] = pair.second;
934
}
935
}
936
937
/* iterate phis for which operands to reload */
938
for (aco_ptr<Instruction>& phi : instructions) {
939
assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);
940
assert(!phi->definitions[0].isTemp() ||
941
ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
942
ctx.spills_entry[block_idx].end());
943
944
std::vector<unsigned>& preds =
945
phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
946
for (unsigned i = 0; i < phi->operands.size(); i++) {
947
if (!phi->operands[i].isTemp())
948
continue;
949
unsigned pred_idx = preds[i];
950
951
/* if the operand was reloaded, rename */
952
if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) ==
953
ctx.spills_exit[pred_idx].end()) {
954
std::map<Temp, Temp>::iterator it =
955
ctx.renames[pred_idx].find(phi->operands[i].getTemp());
956
if (it != ctx.renames[pred_idx].end())
957
phi->operands[i].setTemp(it->second);
958
/* prevent the definining instruction from being DCE'd if it could be rematerialized */
959
else if (ctx.remat.count(phi->operands[i].getTemp()))
960
ctx.remat_used[ctx.remat[phi->operands[i].getTemp()].instr] = true;
961
continue;
962
}
963
964
Temp tmp = phi->operands[i].getTemp();
965
966
/* reload phi operand at end of predecessor block */
967
Temp new_name = ctx.program->allocateTmp(tmp.regClass());
968
Block& pred = ctx.program->blocks[pred_idx];
969
unsigned idx = pred.instructions.size();
970
do {
971
assert(idx != 0);
972
idx--;
973
} while (phi->opcode == aco_opcode::p_phi &&
974
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
975
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
976
aco_ptr<Instruction> reload =
977
do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
978
979
/* reload spilled exec mask directly to exec */
980
if (!phi->definitions[0].isTemp()) {
981
assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);
982
reload->definitions[0] = phi->definitions[0];
983
phi->operands[i] = Operand(exec, ctx.program->lane_mask);
984
} else {
985
ctx.spills_exit[pred_idx].erase(tmp);
986
ctx.renames[pred_idx][tmp] = new_name;
987
phi->operands[i].setTemp(new_name);
988
}
989
990
pred.instructions.insert(it, std::move(reload));
991
}
992
}
993
994
/* iterate live variables for which to reload */
995
// TODO: reload at current block if variable is spilled on all predecessors
996
for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :
997
ctx.next_use_distances_start[block_idx]) {
998
/* skip spilled variables */
999
if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
1000
continue;
1001
std::vector<unsigned> preds =
1002
pair.first.is_linear() ? block->linear_preds : block->logical_preds;
1003
1004
/* variable is dead at predecessor, it must be from a phi */
1005
bool is_dead = false;
1006
for (unsigned pred_idx : preds) {
1007
if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
1008
ctx.next_use_distances_end[pred_idx].end())
1009
is_dead = true;
1010
}
1011
if (is_dead)
1012
continue;
1013
for (unsigned pred_idx : preds) {
1014
/* the variable is not spilled at the predecessor */
1015
if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())
1016
continue;
1017
1018
/* variable is spilled at predecessor and has to be reloaded */
1019
Temp new_name = ctx.program->allocateTmp(pair.first.regClass());
1020
Block& pred = ctx.program->blocks[pred_idx];
1021
unsigned idx = pred.instructions.size();
1022
do {
1023
assert(idx != 0);
1024
idx--;
1025
} while (pair.first.type() == RegType::vgpr &&
1026
pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
1027
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
1028
1029
aco_ptr<Instruction> reload =
1030
do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]);
1031
pred.instructions.insert(it, std::move(reload));
1032
1033
ctx.spills_exit[pred.index].erase(pair.first);
1034
ctx.renames[pred.index][pair.first] = new_name;
1035
}
1036
1037
/* check if we have to create a new phi for this variable */
1038
Temp rename = Temp();
1039
bool is_same = true;
1040
for (unsigned pred_idx : preds) {
1041
if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {
1042
if (rename == Temp())
1043
rename = pair.first;
1044
else
1045
is_same = rename == pair.first;
1046
} else {
1047
if (rename == Temp())
1048
rename = ctx.renames[pred_idx][pair.first];
1049
else
1050
is_same = rename == ctx.renames[pred_idx][pair.first];
1051
}
1052
1053
if (!is_same)
1054
break;
1055
}
1056
1057
if (!is_same) {
1058
/* the variable was renamed differently in the predecessors: we have to create a phi */
1059
aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1060
aco_ptr<Pseudo_instruction> phi{
1061
create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1062
rename = ctx.program->allocateTmp(pair.first.regClass());
1063
for (unsigned i = 0; i < phi->operands.size(); i++) {
1064
Temp tmp;
1065
if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) {
1066
tmp = ctx.renames[preds[i]][pair.first];
1067
} else if (preds[i] >= block_idx) {
1068
tmp = rename;
1069
} else {
1070
tmp = pair.first;
1071
/* prevent the definining instruction from being DCE'd if it could be rematerialized */
1072
if (ctx.remat.count(tmp))
1073
ctx.remat_used[ctx.remat[tmp].instr] = true;
1074
}
1075
phi->operands[i] = Operand(tmp);
1076
}
1077
phi->definitions[0] = Definition(rename);
1078
instructions.emplace_back(std::move(phi));
1079
}
1080
1081
/* the variable was renamed: add new name to renames */
1082
if (!(rename == Temp() || rename == pair.first))
1083
ctx.renames[block_idx][pair.first] = rename;
1084
}
1085
1086
/* combine phis with instructions */
1087
unsigned idx = 0;
1088
while (!block->instructions[idx]) {
1089
idx++;
1090
}
1091
1092
if (!ctx.processed[block_idx]) {
1093
assert(!(block->kind & block_kind_loop_header));
1094
RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx);
1095
ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(),
1096
ctx.register_demand[block->index].begin() + idx);
1097
ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(),
1098
instructions.size(), demand_before);
1099
}
1100
1101
std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx);
1102
instructions.insert(
1103
instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start),
1104
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
1105
block->instructions = std::move(instructions);
1106
}
1107
1108
void
1109
process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
1110
std::map<Temp, uint32_t>& current_spills, RegisterDemand spilled_registers)
1111
{
1112
assert(!ctx.processed[block_idx]);
1113
1114
std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
1115
std::vector<aco_ptr<Instruction>> instructions;
1116
unsigned idx = 0;
1117
1118
/* phis are handled separetely */
1119
while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
1120
block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
1121
instructions.emplace_back(std::move(block->instructions[idx++]));
1122
}
1123
1124
if (block->register_demand.exceeds(ctx.target_pressure))
1125
local_next_use_distance = local_next_uses(ctx, block);
1126
1127
while (idx < block->instructions.size()) {
1128
aco_ptr<Instruction>& instr = block->instructions[idx];
1129
1130
std::map<Temp, std::pair<Temp, uint32_t>> reloads;
1131
std::map<Temp, uint32_t> spills;
1132
/* rename and reload operands */
1133
for (Operand& op : instr->operands) {
1134
if (!op.isTemp())
1135
continue;
1136
if (current_spills.find(op.getTemp()) == current_spills.end()) {
1137
/* the Operand is in register: check if it was renamed */
1138
if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
1139
op.setTemp(ctx.renames[block_idx][op.getTemp()]);
1140
/* prevent it's definining instruction from being DCE'd if it could be rematerialized */
1141
else if (ctx.remat.count(op.getTemp()))
1142
ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
1143
continue;
1144
}
1145
/* the Operand is spilled: add it to reloads */
1146
Temp new_tmp = ctx.program->allocateTmp(op.regClass());
1147
ctx.renames[block_idx][op.getTemp()] = new_tmp;
1148
reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
1149
current_spills.erase(op.getTemp());
1150
op.setTemp(new_tmp);
1151
spilled_registers -= new_tmp;
1152
}
1153
1154
/* check if register demand is low enough before and after the current instruction */
1155
if (block->register_demand.exceeds(ctx.target_pressure)) {
1156
1157
RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
1158
new_demand.update(get_demand_before(ctx, block_idx, idx));
1159
1160
assert(!local_next_use_distance.empty());
1161
1162
/* if reg pressure is too high, spill variable with furthest next use */
1163
while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
1164
unsigned distance = 0;
1165
Temp to_spill;
1166
bool do_rematerialize = false;
1167
RegType type = RegType::sgpr;
1168
if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
1169
type = RegType::vgpr;
1170
1171
for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
1172
if (pair.first.type() != type)
1173
continue;
1174
bool can_rematerialize = ctx.remat.count(pair.first);
1175
if (((pair.second > distance && can_rematerialize == do_rematerialize) ||
1176
(can_rematerialize && !do_rematerialize && pair.second > idx)) &&
1177
current_spills.find(pair.first) == current_spills.end() &&
1178
ctx.spills_exit[block_idx].find(pair.first) ==
1179
ctx.spills_exit[block_idx].end()) {
1180
to_spill = pair.first;
1181
distance = pair.second;
1182
do_rematerialize = can_rematerialize;
1183
}
1184
}
1185
1186
assert(distance != 0 && distance > idx);
1187
uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());
1188
1189
/* add interferences with currently spilled variables */
1190
for (std::pair<Temp, uint32_t> pair : current_spills)
1191
ctx.add_interference(spill_id, pair.second);
1192
for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads)
1193
ctx.add_interference(spill_id, pair.second.second);
1194
1195
current_spills[to_spill] = spill_id;
1196
spilled_registers += to_spill;
1197
1198
/* rename if necessary */
1199
if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
1200
to_spill = ctx.renames[block_idx][to_spill];
1201
}
1202
1203
/* add spill to new instructions */
1204
aco_ptr<Pseudo_instruction> spill{
1205
create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
1206
spill->operands[0] = Operand(to_spill);
1207
spill->operands[1] = Operand::c32(spill_id);
1208
instructions.emplace_back(std::move(spill));
1209
}
1210
}
1211
1212
/* add reloads and instruction to new instructions */
1213
for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
1214
aco_ptr<Instruction> reload =
1215
do_reload(ctx, pair.second.first, pair.first, pair.second.second);
1216
instructions.emplace_back(std::move(reload));
1217
}
1218
instructions.emplace_back(std::move(instr));
1219
idx++;
1220
}
1221
1222
block->instructions = std::move(instructions);
1223
ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1224
}
1225
1226
void
1227
spill_block(spill_ctx& ctx, unsigned block_idx)
1228
{
1229
Block* block = &ctx.program->blocks[block_idx];
1230
1231
/* determine set of variables which are spilled at the beginning of the block */
1232
RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);
1233
1234
/* add interferences for spilled variables */
1235
for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end();
1236
++it) {
1237
for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2)
1238
ctx.add_interference(it->second, it2->second);
1239
}
1240
1241
bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;
1242
if (!is_loop_header) {
1243
/* add spill/reload code on incoming control flow edges */
1244
add_coupling_code(ctx, block, block_idx);
1245
}
1246
1247
std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];
1248
1249
/* check conditions to process this block */
1250
bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
1251
!ctx.renames[block_idx].empty() || ctx.remat_used.size();
1252
1253
for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) {
1254
if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
1255
process = true;
1256
}
1257
1258
if (process)
1259
process_block(ctx, block_idx, block, current_spills, spilled_registers);
1260
else
1261
ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1262
1263
ctx.processed[block_idx] = true;
1264
1265
/* check if the next block leaves the current loop */
1266
if (block->loop_nest_depth == 0 ||
1267
ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
1268
return;
1269
1270
Block* loop_header = ctx.loop_header.top();
1271
1272
/* preserve original renames at end of loop header block */
1273
std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);
1274
1275
/* add coupling code to all loop header predecessors */
1276
add_coupling_code(ctx, loop_header, loop_header->index);
1277
1278
/* propagate new renames through loop: i.e. repair the SSA */
1279
renames.swap(ctx.renames[loop_header->index]);
1280
for (std::pair<Temp, Temp> rename : renames) {
1281
for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {
1282
Block& current = ctx.program->blocks[idx];
1283
std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();
1284
1285
/* first rename phis */
1286
while (instr_it != current.instructions.end()) {
1287
aco_ptr<Instruction>& phi = *instr_it;
1288
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
1289
break;
1290
/* no need to rename the loop header phis once again. this happened in
1291
* add_coupling_code() */
1292
if (idx == loop_header->index) {
1293
instr_it++;
1294
continue;
1295
}
1296
1297
for (Operand& op : phi->operands) {
1298
if (!op.isTemp())
1299
continue;
1300
if (op.getTemp() == rename.first)
1301
op.setTemp(rename.second);
1302
}
1303
instr_it++;
1304
}
1305
1306
/* variable is not live at beginning of this block */
1307
if (ctx.next_use_distances_start[idx].count(rename.first) == 0)
1308
continue;
1309
1310
/* if the variable is live at the block's exit, add rename */
1311
if (ctx.next_use_distances_end[idx].count(rename.first) != 0)
1312
ctx.renames[idx].insert(rename);
1313
1314
/* rename all uses in this block */
1315
bool renamed = false;
1316
while (!renamed && instr_it != current.instructions.end()) {
1317
aco_ptr<Instruction>& instr = *instr_it;
1318
for (Operand& op : instr->operands) {
1319
if (!op.isTemp())
1320
continue;
1321
if (op.getTemp() == rename.first) {
1322
op.setTemp(rename.second);
1323
/* we can stop with this block as soon as the variable is spilled */
1324
if (instr->opcode == aco_opcode::p_spill)
1325
renamed = true;
1326
}
1327
}
1328
instr_it++;
1329
}
1330
}
1331
}
1332
1333
/* remove loop header info from stack */
1334
ctx.loop_header.pop();
1335
}
1336
1337
/* Builds (and emits into `instructions`) the s4 buffer resource descriptor
 * used for VGPR scratch spills. On non-compute stages the private segment
 * buffer pointer is first loaded via SMEM. When `offset` is non-zero it is
 * folded into scratch_offset with s_add_u32. Outside the top level, the code
 * is inserted right before the p_logical_end of `instructions`. */
Temp
load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
                      std::vector<aco_ptr<Instruction>>& instructions, unsigned offset,
                      bool is_top_level)
{
   Builder bld(ctx.program);
   if (is_top_level) {
      bld.reset(&instructions);
   } else {
      /* find p_logical_end */
      unsigned insert_idx = instructions.size() - 1;
      while (instructions[insert_idx]->opcode != aco_opcode::p_logical_end)
         insert_idx--;
      bld.reset(&instructions, std::next(instructions.begin(), insert_idx));
   }

   Temp private_segment_buffer = ctx.program->private_segment_buffer;
   /* graphics stages only get a pointer to the descriptor; load the actual value */
   if (ctx.program->stage != compute_cs)
      private_segment_buffer =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());

   if (offset)
      scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
                                scratch_offset, Operand::c32(offset));

   /* swizzled access: stride depends on the wave size (64 -> 3, 32 -> 2) */
   uint32_t rsrc_conf =
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);

   if (ctx.program->chip_class >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
   } else if (ctx.program->chip_class <= GFX7) {
      /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx.program->chip_class <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
                     Operand::c32(-1u), Operand::c32(rsrc_conf));
}
1380
1381
void
1382
add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,
1383
std::vector<bool>& slots_used, unsigned id)
1384
{
1385
for (unsigned other : ctx.interferences[id].second) {
1386
if (!is_assigned[other])
1387
continue;
1388
1389
RegClass other_rc = ctx.interferences[other].first;
1390
unsigned slot = slots[other];
1391
std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
1392
}
1393
}
1394
1395
unsigned
1396
find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr,
1397
unsigned* num_slots)
1398
{
1399
unsigned wave_size_minus_one = wave_size - 1;
1400
unsigned slot = 0;
1401
1402
while (true) {
1403
bool available = true;
1404
for (unsigned i = 0; i < size; i++) {
1405
if (slot + i < used.size() && used[slot + i]) {
1406
available = false;
1407
break;
1408
}
1409
}
1410
if (!available) {
1411
slot++;
1412
continue;
1413
}
1414
1415
if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
1416
slot = align(slot, wave_size);
1417
continue;
1418
}
1419
1420
std::fill(used.begin(), used.end(), false);
1421
1422
if (slot + size > used.size())
1423
used.resize(slot + size);
1424
1425
return slot;
1426
}
1427
}
1428
1429
void
1430
assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,
1431
std::vector<uint32_t>& slots, unsigned* num_slots)
1432
{
1433
std::vector<bool> slots_used(*num_slots);
1434
1435
/* assign slots for ids with affinities first */
1436
for (std::vector<uint32_t>& vec : ctx.affinities) {
1437
if (ctx.interferences[vec[0]].first.type() != type)
1438
continue;
1439
1440
for (unsigned id : vec) {
1441
if (!ctx.is_reloaded[id])
1442
continue;
1443
1444
add_interferences(ctx, is_assigned, slots, slots_used, id);
1445
}
1446
1447
unsigned slot =
1448
find_available_slot(slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(),
1449
type == RegType::sgpr, num_slots);
1450
1451
for (unsigned id : vec) {
1452
assert(!is_assigned[id]);
1453
1454
if (ctx.is_reloaded[id]) {
1455
slots[id] = slot;
1456
is_assigned[id] = true;
1457
}
1458
}
1459
}
1460
1461
/* assign slots for ids without affinities */
1462
for (unsigned id = 0; id < ctx.interferences.size(); id++) {
1463
if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
1464
continue;
1465
1466
add_interferences(ctx, is_assigned, slots, slots_used, id);
1467
1468
unsigned slot =
1469
find_available_slot(slots_used, ctx.wave_size, ctx.interferences[id].first.size(),
1470
type == RegType::sgpr, num_slots);
1471
1472
slots[id] = slot;
1473
is_assigned[id] = true;
1474
}
1475
1476
*num_slots = slots_used.size();
1477
}
1478
1479
void
1480
assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
1481
{
1482
std::vector<uint32_t> slots(ctx.interferences.size());
1483
std::vector<bool> is_assigned(ctx.interferences.size());
1484
1485
/* first, handle affinities: just merge all interferences into both spill ids */
1486
for (std::vector<uint32_t>& vec : ctx.affinities) {
1487
for (unsigned i = 0; i < vec.size(); i++) {
1488
for (unsigned j = i + 1; j < vec.size(); j++) {
1489
assert(vec[i] != vec[j]);
1490
bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
1491
ctx.is_reloaded[vec[i]] = reloaded;
1492
ctx.is_reloaded[vec[j]] = reloaded;
1493
}
1494
}
1495
}
1496
for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
1497
for (ASSERTED uint32_t id : ctx.interferences[i].second)
1498
assert(i != id);
1499
1500
/* for each spill slot, assign as many spill ids as possible */
1501
unsigned sgpr_spill_slots = 0, vgpr_spill_slots = 0;
1502
assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &sgpr_spill_slots);
1503
assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &vgpr_spill_slots);
1504
1505
for (unsigned id = 0; id < is_assigned.size(); id++)
1506
assert(is_assigned[id] || !ctx.is_reloaded[id]);
1507
1508
for (std::vector<uint32_t>& vec : ctx.affinities) {
1509
for (unsigned i = 0; i < vec.size(); i++) {
1510
for (unsigned j = i + 1; j < vec.size(); j++) {
1511
assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
1512
if (!is_assigned[vec[i]])
1513
continue;
1514
assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
1515
assert(ctx.interferences[vec[i]].first.type() ==
1516
ctx.interferences[vec[j]].first.type());
1517
assert(slots[vec[i]] == slots[vec[j]]);
1518
}
1519
}
1520
}
1521
1522
/* hope, we didn't mess up */
1523
std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);
1524
assert(vgpr_spill_temps.size() <= spills_to_vgpr);
1525
1526
/* replace pseudo instructions with actual hardware instructions */
1527
Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();
1528
unsigned last_top_level_block_idx = 0;
1529
std::vector<bool> reload_in_loop(vgpr_spill_temps.size());
1530
for (Block& block : ctx.program->blocks) {
1531
1532
/* after loops, we insert a user if there was a reload inside the loop */
1533
if (block.loop_nest_depth == 0) {
1534
int end_vgprs = 0;
1535
for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1536
if (reload_in_loop[i])
1537
end_vgprs++;
1538
}
1539
1540
if (end_vgprs > 0) {
1541
aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(
1542
aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)};
1543
int k = 0;
1544
for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1545
if (reload_in_loop[i])
1546
destr->operands[k++] = Operand(vgpr_spill_temps[i]);
1547
reload_in_loop[i] = false;
1548
}
1549
/* find insertion point */
1550
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1551
while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1552
++it;
1553
block.instructions.insert(it, std::move(destr));
1554
}
1555
}
1556
1557
if (block.kind & block_kind_top_level && !block.linear_preds.empty()) {
1558
last_top_level_block_idx = block.index;
1559
1560
/* check if any spilled variables use a created linear vgpr, otherwise destroy them */
1561
for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1562
if (vgpr_spill_temps[i] == Temp())
1563
continue;
1564
1565
bool can_destroy = true;
1566
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) {
1567
1568
if (ctx.interferences[pair.second].first.type() == RegType::sgpr &&
1569
slots[pair.second] / ctx.wave_size == i) {
1570
can_destroy = false;
1571
break;
1572
}
1573
}
1574
if (can_destroy)
1575
vgpr_spill_temps[i] = Temp();
1576
}
1577
}
1578
1579
std::vector<aco_ptr<Instruction>>::iterator it;
1580
std::vector<aco_ptr<Instruction>> instructions;
1581
instructions.reserve(block.instructions.size());
1582
Builder bld(ctx.program, &instructions);
1583
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1584
1585
if ((*it)->opcode == aco_opcode::p_spill) {
1586
uint32_t spill_id = (*it)->operands[1].constantValue();
1587
1588
if (!ctx.is_reloaded[spill_id]) {
1589
/* never reloaded, so don't spill */
1590
} else if (!is_assigned[spill_id]) {
1591
unreachable("No spill slot assigned for spill id");
1592
} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1593
/* spill vgpr */
1594
ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
1595
uint32_t spill_slot = slots[spill_id];
1596
bool add_offset_to_sgpr =
1597
ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
1598
vgpr_spill_slots * 4 >
1599
4096;
1600
unsigned base_offset =
1601
add_offset_to_sgpr
1602
? 0
1603
: ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1604
1605
/* check if the scratch resource descriptor already exists */
1606
if (scratch_rsrc == Temp()) {
1607
unsigned offset =
1608
add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1609
scratch_rsrc = load_scratch_resource(
1610
ctx, scratch_offset,
1611
last_top_level_block_idx == block.index
1612
? instructions
1613
: ctx.program->blocks[last_top_level_block_idx].instructions,
1614
offset, last_top_level_block_idx == block.index);
1615
}
1616
1617
unsigned offset = base_offset + spill_slot * 4;
1618
aco_opcode opcode = aco_opcode::buffer_store_dword;
1619
assert((*it)->operands[0].isTemp());
1620
Temp temp = (*it)->operands[0].getTemp();
1621
assert(temp.type() == RegType::vgpr && !temp.is_linear());
1622
if (temp.size() > 1) {
1623
Instruction* split{create_instruction<Pseudo_instruction>(
1624
aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
1625
split->operands[0] = Operand(temp);
1626
for (unsigned i = 0; i < temp.size(); i++)
1627
split->definitions[i] = bld.def(v1);
1628
bld.insert(split);
1629
for (unsigned i = 0; i < temp.size(); i++) {
1630
Instruction* instr =
1631
bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,
1632
split->definitions[i].getTemp(), offset + i * 4, false, true);
1633
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1634
}
1635
} else {
1636
Instruction* instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,
1637
temp, offset, false, true);
1638
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1639
}
1640
} else {
1641
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
1642
1643
uint32_t spill_slot = slots[spill_id];
1644
1645
/* check if the linear vgpr already exists */
1646
if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1647
Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1648
vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1649
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
1650
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1651
create->definitions[0] = Definition(linear_vgpr);
1652
/* find the right place to insert this definition */
1653
if (last_top_level_block_idx == block.index) {
1654
/* insert right before the current instruction */
1655
instructions.emplace_back(std::move(create));
1656
} else {
1657
assert(last_top_level_block_idx < block.index);
1658
/* insert before the branch at last top level block */
1659
std::vector<aco_ptr<Instruction>>& block_instrs =
1660
ctx.program->blocks[last_top_level_block_idx].instructions;
1661
block_instrs.insert(std::prev(block_instrs.end()), std::move(create));
1662
}
1663
}
1664
1665
/* spill sgpr: just add the vgpr temp to operands */
1666
Pseudo_instruction* spill =
1667
create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
1668
spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1669
spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1670
spill->operands[2] = (*it)->operands[0];
1671
instructions.emplace_back(aco_ptr<Instruction>(spill));
1672
}
1673
1674
} else if ((*it)->opcode == aco_opcode::p_reload) {
1675
uint32_t spill_id = (*it)->operands[0].constantValue();
1676
assert(ctx.is_reloaded[spill_id]);
1677
1678
if (!is_assigned[spill_id]) {
1679
unreachable("No spill slot assigned for spill id");
1680
} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {
1681
/* reload vgpr */
1682
uint32_t spill_slot = slots[spill_id];
1683
bool add_offset_to_sgpr =
1684
ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
1685
vgpr_spill_slots * 4 >
1686
4096;
1687
unsigned base_offset =
1688
add_offset_to_sgpr
1689
? 0
1690
: ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1691
1692
/* check if the scratch resource descriptor already exists */
1693
if (scratch_rsrc == Temp()) {
1694
unsigned offset =
1695
add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1696
scratch_rsrc = load_scratch_resource(
1697
ctx, scratch_offset,
1698
last_top_level_block_idx == block.index
1699
? instructions
1700
: ctx.program->blocks[last_top_level_block_idx].instructions,
1701
offset, last_top_level_block_idx == block.index);
1702
}
1703
1704
unsigned offset = base_offset + spill_slot * 4;
1705
aco_opcode opcode = aco_opcode::buffer_load_dword;
1706
Definition def = (*it)->definitions[0];
1707
if (def.size() > 1) {
1708
Instruction* vec{create_instruction<Pseudo_instruction>(
1709
aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
1710
vec->definitions[0] = def;
1711
for (unsigned i = 0; i < def.size(); i++) {
1712
Temp tmp = bld.tmp(v1);
1713
vec->operands[i] = Operand(tmp);
1714
Instruction* instr =
1715
bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1),
1716
scratch_offset, offset + i * 4, false, true);
1717
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1718
}
1719
bld.insert(vec);
1720
} else {
1721
Instruction* instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1),
1722
scratch_offset, offset, false, true);
1723
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
1724
}
1725
} else {
1726
uint32_t spill_slot = slots[spill_id];
1727
reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0;
1728
1729
/* check if the linear vgpr already exists */
1730
if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
1731
Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
1732
vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
1733
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
1734
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1735
create->definitions[0] = Definition(linear_vgpr);
1736
/* find the right place to insert this definition */
1737
if (last_top_level_block_idx == block.index) {
1738
/* insert right before the current instruction */
1739
instructions.emplace_back(std::move(create));
1740
} else {
1741
assert(last_top_level_block_idx < block.index);
1742
/* insert before the branch at last top level block */
1743
std::vector<aco_ptr<Instruction>>& block_instrs =
1744
ctx.program->blocks[last_top_level_block_idx].instructions;
1745
block_instrs.insert(std::prev(block_instrs.end()), std::move(create));
1746
}
1747
}
1748
1749
/* reload sgpr: just add the vgpr temp to operands */
1750
Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(
1751
aco_opcode::p_reload, Format::PSEUDO, 2, 1);
1752
reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
1753
reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
1754
reload->definitions[0] = (*it)->definitions[0];
1755
instructions.emplace_back(aco_ptr<Instruction>(reload));
1756
}
1757
} else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {
1758
instructions.emplace_back(std::move(*it));
1759
}
1760
}
1761
block.instructions = std::move(instructions);
1762
}
1763
1764
/* update required scratch memory */
1765
ctx.program->config->scratch_bytes_per_wave +=
1766
align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);
1767
1768
/* SSA elimination inserts copies for logical phis right before p_logical_end
1769
* So if a linear vgpr is used between that p_logical_end and the branch,
1770
* we need to ensure logical phis don't choose a definition which aliases
1771
* the linear vgpr.
1772
* TODO: Moving the spills and reloads to before p_logical_end might produce
1773
* slightly better code. */
1774
for (Block& block : ctx.program->blocks) {
1775
/* loops exits are already handled */
1776
if (block.logical_preds.size() <= 1)
1777
continue;
1778
1779
bool has_logical_phis = false;
1780
for (aco_ptr<Instruction>& instr : block.instructions) {
1781
if (instr->opcode == aco_opcode::p_phi) {
1782
has_logical_phis = true;
1783
break;
1784
} else if (instr->opcode != aco_opcode::p_linear_phi) {
1785
break;
1786
}
1787
}
1788
if (!has_logical_phis)
1789
continue;
1790
1791
std::set<Temp> vgprs;
1792
for (unsigned pred_idx : block.logical_preds) {
1793
Block& pred = ctx.program->blocks[pred_idx];
1794
for (int i = pred.instructions.size() - 1; i >= 0; i--) {
1795
aco_ptr<Instruction>& pred_instr = pred.instructions[i];
1796
if (pred_instr->opcode == aco_opcode::p_logical_end) {
1797
break;
1798
} else if (pred_instr->opcode == aco_opcode::p_spill ||
1799
pred_instr->opcode == aco_opcode::p_reload) {
1800
vgprs.insert(pred_instr->operands[0].getTemp());
1801
}
1802
}
1803
}
1804
if (!vgprs.size())
1805
continue;
1806
1807
aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(
1808
aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)};
1809
int k = 0;
1810
for (Temp tmp : vgprs) {
1811
destr->operands[k++] = Operand(tmp);
1812
}
1813
/* find insertion point */
1814
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1815
while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1816
++it;
1817
block.instructions.insert(it, std::move(destr));
1818
}
1819
}
1820
1821
} /* end namespace */
1822
1823
void
1824
spill(Program* program, live& live_vars)
1825
{
1826
program->config->spilled_vgprs = 0;
1827
program->config->spilled_sgprs = 0;
1828
1829
program->progress = CompilationProgress::after_spilling;
1830
1831
/* no spilling when register pressure is low enough */
1832
if (program->num_waves > 0)
1833
return;
1834
1835
/* lower to CSSA before spilling to ensure correctness w.r.t. phis */
1836
lower_to_cssa(program, live_vars);
1837
1838
/* calculate target register demand */
1839
const RegisterDemand demand = program->max_reg_demand; /* current max */
1840
const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
1841
const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
1842
uint16_t extra_vgprs = 0;
1843
uint16_t extra_sgprs = 0;
1844
1845
/* calculate extra VGPRs required for spilling SGPRs */
1846
if (demand.sgpr > sgpr_limit) {
1847
unsigned sgpr_spills = demand.sgpr - sgpr_limit;
1848
extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;
1849
}
1850
/* add extra SGPRs required for spilling VGPRs */
1851
if (demand.vgpr + extra_vgprs > vgpr_limit) {
1852
extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
1853
if (demand.sgpr + extra_sgprs > sgpr_limit) {
1854
/* re-calculate in case something has changed */
1855
unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
1856
extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;
1857
}
1858
}
1859
/* the spiller has to target the following register demand */
1860
const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);
1861
1862
/* initialize ctx */
1863
spill_ctx ctx(target, program, live_vars.register_demand);
1864
compute_global_next_uses(ctx);
1865
get_rematerialize_info(ctx);
1866
1867
/* create spills and reloads */
1868
for (unsigned i = 0; i < program->blocks.size(); i++)
1869
spill_block(ctx, i);
1870
1871
/* assign spill slots and DCE rematerialized code */
1872
assign_spill_slots(ctx, extra_vgprs);
1873
1874
/* update live variable information */
1875
live_vars = live_var_analysis(program);
1876
1877
assert(program->num_waves > 0);
1878
}
1879
1880
} // namespace aco
1881
1882