Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_lower_to_cssa.cpp
4550 views
1
/*
2
* Copyright © 2019 Valve Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*
23
*/
24
25
#include "aco_builder.h"
26
#include "aco_ir.h"
27
28
#include <algorithm>
29
#include <map>
30
#include <unordered_map>
31
#include <vector>
32
33
/*
34
* Implements an algorithm to lower to Concentional SSA Form (CSSA).
35
* After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
36
* by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
37
*
38
* By lowering the IR to CSSA, the insertion of parallelcopies is separated from
39
* the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
40
* The algorithm coalesces non-interfering phi-resources while taking value-equality
41
* into account. Re-indexes the SSA-defs.
42
*/
43
44
namespace aco {
45
namespace {
46
47
typedef std::vector<Temp> merge_set;
48
49
struct copy {
50
Definition def;
51
Operand op;
52
};
53
54
struct merge_node {
55
Operand value = Operand(); /* original value: can be an SSA-def or constant value */
56
uint32_t index = -1u; /* index into the vector of merge sets */
57
uint32_t defined_at = -1u; /* defining block */
58
59
/* we also remember two dominating defs with the same value: */
60
Temp equal_anc_in = Temp(); /* within the same merge set */
61
Temp equal_anc_out = Temp(); /* from a different set */
62
};
63
64
struct cssa_ctx {
65
Program* program;
66
std::vector<IDSet>& live_out; /* live-out sets per block */
67
std::vector<std::vector<copy>> parallelcopies; /* copies per block */
68
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
69
std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
70
};
71
72
/* create (virtual) parallelcopies for each phi instruction and
73
* already merge copy-definitions with phi-defs into merge sets */
74
void
75
collect_parallelcopies(cssa_ctx& ctx)
76
{
77
ctx.parallelcopies.resize(ctx.program->blocks.size());
78
Builder bld(ctx.program);
79
for (Block& block : ctx.program->blocks) {
80
for (aco_ptr<Instruction>& phi : block.instructions) {
81
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
82
break;
83
84
const Definition& def = phi->definitions[0];
85
86
/* if the definition is not temp, it is the exec mask.
87
* We can reload the exec mask directly from the spill slot.
88
*/
89
if (!def.isTemp())
90
continue;
91
92
std::vector<unsigned>& preds =
93
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
94
uint32_t index = ctx.merge_sets.size();
95
merge_set set;
96
97
bool has_preheader_copy = false;
98
for (unsigned i = 0; i < phi->operands.size(); i++) {
99
Operand op = phi->operands[i];
100
if (op.isUndefined())
101
continue;
102
103
if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
104
/* SGPR inline constants and literals on GFX10+ can be spilled
105
* and reloaded directly (without intermediate register) */
106
if (op.isConstant()) {
107
if (ctx.program->chip_class >= GFX10)
108
continue;
109
if (op.size() == 1 && !op.isLiteral())
110
continue;
111
} else {
112
assert(op.isFixed() && op.physReg() == exec);
113
continue;
114
}
115
}
116
117
/* create new temporary and rename operands */
118
Temp tmp = bld.tmp(def.regClass());
119
ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
120
phi->operands[i] = Operand(tmp);
121
122
/* place the new operands in the same merge set */
123
set.emplace_back(tmp);
124
ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};
125
126
/* update the liveness information */
127
if (op.isKill())
128
ctx.live_out[preds[i]].erase(op.tempId());
129
ctx.live_out[preds[i]].insert(tmp.id());
130
131
has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
132
}
133
134
if (set.empty())
135
continue;
136
137
/* place the definition in dominance-order */
138
if (def.isTemp()) {
139
if (has_preheader_copy)
140
set.emplace(std::next(set.begin()), def.getTemp());
141
else if (block.kind & block_kind_loop_header)
142
set.emplace(set.begin(), def.getTemp());
143
else
144
set.emplace_back(def.getTemp());
145
ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
146
}
147
ctx.merge_sets.emplace_back(set);
148
}
149
}
150
}
151
152
/* check whether the definition of a comes after b. */
153
inline bool
154
defined_after(cssa_ctx& ctx, Temp a, Temp b)
155
{
156
merge_node& node_a = ctx.merge_node_table[a.id()];
157
merge_node& node_b = ctx.merge_node_table[b.id()];
158
if (node_a.defined_at == node_b.defined_at)
159
return a.id() > b.id();
160
161
return node_a.defined_at > node_b.defined_at;
162
}
163
164
/* check whether a dominates b where b is defined after a */
165
inline bool
166
dominates(cssa_ctx& ctx, Temp a, Temp b)
167
{
168
assert(defined_after(ctx, b, a));
169
merge_node& node_a = ctx.merge_node_table[a.id()];
170
merge_node& node_b = ctx.merge_node_table[b.id()];
171
unsigned idom = node_b.defined_at;
172
while (idom > node_a.defined_at)
173
idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
174
: ctx.program->blocks[idom].linear_idom;
175
176
return idom == node_a.defined_at;
177
}
178
179
/* check intersection between var and parent:
180
* We already know that parent dominates var. */
181
inline bool
182
intersects(cssa_ctx& ctx, Temp var, Temp parent)
183
{
184
merge_node& node_var = ctx.merge_node_table[var.id()];
185
merge_node& node_parent = ctx.merge_node_table[parent.id()];
186
assert(node_var.index != node_parent.index);
187
uint32_t block_idx = node_var.defined_at;
188
189
/* if the parent is live-out at the definition block of var, they intersect */
190
bool parent_live = ctx.live_out[block_idx].count(parent.id());
191
if (parent_live)
192
return true;
193
194
/* parent is defined in a different block than var */
195
if (node_parent.defined_at < node_var.defined_at) {
196
/* if the parent is not live-in, they don't interfere */
197
std::vector<uint32_t>& preds = var.type() == RegType::vgpr
198
? ctx.program->blocks[block_idx].logical_preds
199
: ctx.program->blocks[block_idx].linear_preds;
200
for (uint32_t pred : preds) {
201
if (!ctx.live_out[pred].count(parent.id()))
202
return false;
203
}
204
}
205
206
for (const copy& cp : ctx.parallelcopies[block_idx]) {
207
/* if var is defined at the edge, they don't intersect */
208
if (cp.def.getTemp() == var)
209
return false;
210
if (cp.op.isTemp() && cp.op.getTemp() == parent)
211
parent_live = true;
212
}
213
/* if the parent is live at the edge, they intersect */
214
if (parent_live)
215
return true;
216
217
/* both, parent and var, are present in the same block */
218
const Block& block = ctx.program->blocks[block_idx];
219
for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
220
/* if the parent was not encountered yet, it can only be used by a phi */
221
if (is_phi(it->get()))
222
break;
223
224
for (const Definition& def : (*it)->definitions) {
225
if (!def.isTemp())
226
continue;
227
/* if parent was not found yet, they don't intersect */
228
if (def.getTemp() == var)
229
return false;
230
}
231
232
for (const Operand& op : (*it)->operands) {
233
if (!op.isTemp())
234
continue;
235
/* if the var was defined before this point, they intersect */
236
if (op.getTemp() == parent)
237
return true;
238
}
239
}
240
241
return false;
242
}
243
244
/* check interference between var and parent:
245
* i.e. they have different values and intersect.
246
* If parent and var share the same value, also updates the equal ancestor. */
247
inline bool
248
interference(cssa_ctx& ctx, Temp var, Temp parent)
249
{
250
assert(var != parent);
251
merge_node& node_var = ctx.merge_node_table[var.id()];
252
node_var.equal_anc_out = Temp();
253
254
if (node_var.index == ctx.merge_node_table[parent.id()].index) {
255
/* check/update in other set */
256
parent = ctx.merge_node_table[parent.id()].equal_anc_out;
257
}
258
259
Temp tmp = parent;
260
/* check if var intersects with parent or any equal-valued ancestor */
261
while (tmp != Temp() && !intersects(ctx, var, tmp)) {
262
merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
263
tmp = node_tmp.equal_anc_in;
264
}
265
266
/* no intersection found */
267
if (tmp == Temp())
268
return false;
269
270
/* var and parent, same value, but in different sets */
271
if (node_var.value == ctx.merge_node_table[parent.id()].value) {
272
node_var.equal_anc_out = tmp;
273
return false;
274
}
275
276
/* var and parent, different values and intersect */
277
return true;
278
}
279
280
/* tries to merge set_b into set_a of given temporary and
281
* drops that temporary as it is being coalesced */
282
bool
283
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
284
{
285
auto def_node_it = ctx.merge_node_table.find(dst.id());
286
uint32_t index = def_node_it->second.index;
287
merge_set& set_a = ctx.merge_sets[index];
288
std::vector<Temp> dom; /* stack of the traversal */
289
merge_set union_set; /* the new merged merge-set */
290
uint32_t i_a = 0;
291
uint32_t i_b = 0;
292
293
while (i_a < set_a.size() || i_b < set_b.size()) {
294
Temp current;
295
if (i_a == set_a.size())
296
current = set_b[i_b++];
297
else if (i_b == set_b.size())
298
current = set_a[i_a++];
299
/* else pick the one defined first */
300
else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
301
current = set_b[i_b++];
302
else
303
current = set_a[i_a++];
304
305
while (!dom.empty() && !dominates(ctx, dom.back(), current))
306
dom.pop_back(); /* not the desired parent, remove */
307
308
if (!dom.empty() && interference(ctx, current, dom.back()))
309
return false; /* intersection detected */
310
311
dom.emplace_back(current); /* otherwise, keep checking */
312
if (current != dst)
313
union_set.emplace_back(current); /* maintain the new merge-set sorted */
314
}
315
316
/* update hashmap */
317
for (Temp t : union_set) {
318
merge_node& node = ctx.merge_node_table[t.id()];
319
/* update the equal ancestors:
320
* i.e. the 'closest' dominating def with the same value */
321
Temp in = node.equal_anc_in;
322
Temp out = node.equal_anc_out;
323
if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
324
node.equal_anc_in = out;
325
node.equal_anc_out = Temp();
326
/* update merge-set index */
327
node.index = index;
328
}
329
set_b = merge_set(); /* free the old set_b */
330
ctx.merge_sets[index] = union_set;
331
ctx.merge_node_table.erase(dst.id()); /* remove the temporary */
332
333
return true;
334
}
335
336
/* returns true if the copy can safely be omitted */
337
bool
338
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
339
{
340
/* we can only coalesce temporaries */
341
if (!copy.op.isTemp())
342
return false;
343
344
/* try emplace a merge_node for the copy operand */
345
merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
346
if (op_node.defined_at == -1u) {
347
/* find defining block of operand */
348
uint32_t pred = block_idx;
349
do {
350
block_idx = pred;
351
pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
352
: ctx.program->blocks[pred].linear_idom;
353
} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
354
op_node.defined_at = block_idx;
355
op_node.value = copy.op;
356
}
357
358
/* we can only coalesce copies of the same register class */
359
if (copy.op.regClass() != copy.def.regClass())
360
return false;
361
362
/* check if this operand has not yet been coalesced */
363
if (op_node.index == -1u) {
364
merge_set op_set = merge_set{copy.op.getTemp()};
365
return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
366
}
367
368
/* check if this operand has been coalesced into the same set */
369
assert(ctx.merge_node_table.count(copy.def.tempId()));
370
if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
371
return true;
372
373
/* otherwise, try to coalesce both merge sets */
374
return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
375
}
376
377
/* node in the location-transfer-graph */
378
struct ltg_node {
379
copy cp;
380
uint32_t read_idx;
381
uint32_t num_uses = 0;
382
};
383
384
/* emit the copies in an order that does not
385
* create interferences within a merge-set */
386
void
387
emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
388
{
389
auto&& it = ltg.begin();
390
while (it != ltg.end()) {
391
const copy& cp = it->second.cp;
392
/* wrong regclass or still needed as operand */
393
if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
394
++it;
395
continue;
396
}
397
398
/* emit the copy */
399
bld.copy(cp.def, it->second.cp.op);
400
401
/* update the location transfer graph */
402
if (it->second.read_idx != -1u) {
403
auto&& other = ltg.find(it->second.read_idx);
404
if (other != ltg.end())
405
other->second.num_uses--;
406
}
407
ltg.erase(it);
408
it = ltg.begin();
409
}
410
411
/* count the number of remaining circular dependencies */
412
unsigned num = std::count_if(ltg.begin(), ltg.end(),
413
[&](auto& n) { return n.second.cp.def.regClass().type() == type; });
414
415
/* if there are circular dependencies, we just emit them as single parallelcopy */
416
if (num) {
417
// TODO: this should be restricted to a feasible number of registers
418
// and otherwise use a temporary to avoid having to reload more (spilled)
419
// variables than we have registers.
420
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
421
aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
422
it = ltg.begin();
423
for (unsigned i = 0; i < num; i++) {
424
while (it->second.cp.def.regClass().type() != type)
425
++it;
426
427
copy->definitions[i] = it->second.cp.def;
428
copy->operands[i] = it->second.cp.op;
429
it = ltg.erase(it);
430
}
431
bld.insert(std::move(copy));
432
}
433
}
434
435
/* either emits or coalesces all parallelcopies and
436
* renames the phi-operands accordingly. */
437
void
438
emit_parallelcopies(cssa_ctx& ctx)
439
{
440
std::unordered_map<uint32_t, Operand> renames;
441
442
/* we iterate backwards to prioritize coalescing in else-blocks */
443
for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
444
if (ctx.parallelcopies[i].empty())
445
continue;
446
447
std::map<uint32_t, ltg_node> ltg;
448
/* first, try to coalesce all parallelcopies */
449
for (const copy& cp : ctx.parallelcopies[i]) {
450
if (try_coalesce_copy(ctx, cp, i)) {
451
renames.emplace(cp.def.tempId(), cp.op);
452
/* update liveness info */
453
ctx.live_out[i].erase(cp.def.tempId());
454
ctx.live_out[i].insert(cp.op.tempId());
455
} else {
456
uint32_t read_idx = -1u;
457
if (cp.op.isTemp())
458
read_idx = ctx.merge_node_table[cp.op.tempId()].index;
459
uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
460
assert(write_idx != -1u);
461
ltg[write_idx] = {cp, read_idx};
462
}
463
}
464
465
/* build location-transfer-graph */
466
for (auto& pair : ltg) {
467
if (pair.second.read_idx == -1u)
468
continue;
469
auto&& it = ltg.find(pair.second.read_idx);
470
if (it != ltg.end())
471
it->second.num_uses++;
472
}
473
474
/* emit parallelcopies ordered */
475
Builder bld(ctx.program);
476
Block& block = ctx.program->blocks[i];
477
478
/* emit VGPR copies */
479
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
480
{ return inst->opcode == aco_opcode::p_logical_end; };
481
auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
482
bld.reset(&block.instructions, std::prev(it.base()));
483
emit_copies_block(bld, ltg, RegType::vgpr);
484
485
/* emit SGPR copies */
486
aco_ptr<Instruction> branch = std::move(block.instructions.back());
487
block.instructions.pop_back();
488
bld.reset(&block.instructions);
489
emit_copies_block(bld, ltg, RegType::sgpr);
490
bld.insert(std::move(branch));
491
}
492
493
/* finally, rename coalesced phi operands */
494
for (Block& block : ctx.program->blocks) {
495
for (aco_ptr<Instruction>& phi : block.instructions) {
496
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
497
break;
498
499
for (Operand& op : phi->operands) {
500
if (!op.isTemp())
501
continue;
502
auto&& it = renames.find(op.tempId());
503
if (it != renames.end()) {
504
op = it->second;
505
renames.erase(it);
506
}
507
}
508
}
509
}
510
assert(renames.empty());
511
}
512
513
} /* end namespace */
514
515
void
516
lower_to_cssa(Program* program, live& live_vars)
517
{
518
reindex_ssa(program, live_vars.live_out);
519
cssa_ctx ctx = {program, live_vars.live_out};
520
collect_parallelcopies(ctx);
521
emit_parallelcopies(ctx);
522
523
/* update live variable information */
524
live_vars = live_var_analysis(program);
525
}
526
} // namespace aco
527
528