CoCalc -- aco_lower_to

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_lower_to_cssa.cpp
⁴⁵⁵⁰ views
1
/*
2
 * Copyright © 2019 Valve Corporation
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 *
23
 */
24

25
#include "aco_builder.h"
26
#include "aco_ir.h"
27

28
#include <algorithm>
29
#include <map>
30
#include <unordered_map>
31
#include <vector>
32

33
/*
34
 * Implements an algorithm to lower to Concentional SSA Form (CSSA).
35
 * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
36
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
37
 *
38
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
39
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
40
 * The algorithm coalesces non-interfering phi-resources while taking value-equality
41
 * into account. Re-indexes the SSA-defs.
42
 */
43

44
namespace aco {
45
namespace {
46

47
typedef std::vector<Temp> merge_set;
48

49
struct copy {
50
   Definition def;
51
   Operand op;
52
};
53

54
struct merge_node {
55
   Operand value = Operand(); /* original value: can be an SSA-def or constant value */
56
   uint32_t index = -1u;      /* index into the vector of merge sets */
57
   uint32_t defined_at = -1u; /* defining block */
58

59
   /* we also remember two dominating defs with the same value: */
60
   Temp equal_anc_in = Temp();  /* within the same merge set */
61
   Temp equal_anc_out = Temp(); /* from a different set */
62
};
63

64
struct cssa_ctx {
65
   Program* program;
66
   std::vector<IDSet>& live_out;                  /* live-out sets per block */
67
   std::vector<std::vector<copy>> parallelcopies; /* copies per block */
68
   std::vector<merge_set> merge_sets;             /* each vector is one (ordered) merge set */
69
   std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
70
};
71

72
/* create (virtual) parallelcopies for each phi instruction and
73
 * already merge copy-definitions with phi-defs into merge sets */
74
void
75
collect_parallelcopies(cssa_ctx& ctx)
76
{
77
   ctx.parallelcopies.resize(ctx.program->blocks.size());
78
   Builder bld(ctx.program);
79
   for (Block& block : ctx.program->blocks) {
80
      for (aco_ptr<Instruction>& phi : block.instructions) {
81
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
82
            break;
83

84
         const Definition& def = phi->definitions[0];
85

86
         /* if the definition is not temp, it is the exec mask.
87
          * We can reload the exec mask directly from the spill slot.
88
          */
89
         if (!def.isTemp())
90
            continue;
91

92
         std::vector<unsigned>& preds =
93
            phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
94
         uint32_t index = ctx.merge_sets.size();
95
         merge_set set;
96

97
         bool has_preheader_copy = false;
98
         for (unsigned i = 0; i < phi->operands.size(); i++) {
99
            Operand op = phi->operands[i];
100
            if (op.isUndefined())
101
               continue;
102

103
            if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
104
               /* SGPR inline constants and literals on GFX10+ can be spilled
105
                * and reloaded directly (without intermediate register) */
106
               if (op.isConstant()) {
107
                  if (ctx.program->chip_class >= GFX10)
108
                     continue;
109
                  if (op.size() == 1 && !op.isLiteral())
110
                     continue;
111
               } else {
112
                  assert(op.isFixed() && op.physReg() == exec);
113
                  continue;
114
               }
115
            }
116

117
            /* create new temporary and rename operands */
118
            Temp tmp = bld.tmp(def.regClass());
119
            ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
120
            phi->operands[i] = Operand(tmp);
121

122
            /* place the new operands in the same merge set */
123
            set.emplace_back(tmp);
124
            ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};
125

126
            /* update the liveness information */
127
            if (op.isKill())
128
               ctx.live_out[preds[i]].erase(op.tempId());
129
            ctx.live_out[preds[i]].insert(tmp.id());
130

131
            has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
132
         }
133

134
         if (set.empty())
135
            continue;
136

137
         /* place the definition in dominance-order */
138
         if (def.isTemp()) {
139
            if (has_preheader_copy)
140
               set.emplace(std::next(set.begin()), def.getTemp());
141
            else if (block.kind & block_kind_loop_header)
142
               set.emplace(set.begin(), def.getTemp());
143
            else
144
               set.emplace_back(def.getTemp());
145
            ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
146
         }
147
         ctx.merge_sets.emplace_back(set);
148
      }
149
   }
150
}
151

152
/* check whether the definition of a comes after b. */
153
inline bool
154
defined_after(cssa_ctx& ctx, Temp a, Temp b)
155
{
156
   merge_node& node_a = ctx.merge_node_table[a.id()];
157
   merge_node& node_b = ctx.merge_node_table[b.id()];
158
   if (node_a.defined_at == node_b.defined_at)
159
      return a.id() > b.id();
160

161
   return node_a.defined_at > node_b.defined_at;
162
}
163

164
/* check whether a dominates b where b is defined after a */
165
inline bool
166
dominates(cssa_ctx& ctx, Temp a, Temp b)
167
{
168
   assert(defined_after(ctx, b, a));
169
   merge_node& node_a = ctx.merge_node_table[a.id()];
170
   merge_node& node_b = ctx.merge_node_table[b.id()];
171
   unsigned idom = node_b.defined_at;
172
   while (idom > node_a.defined_at)
173
      idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
174
                                                  : ctx.program->blocks[idom].linear_idom;
175

176
   return idom == node_a.defined_at;
177
}
178

179
/* check intersection between var and parent:
180
 * We already know that parent dominates var. */
181
inline bool
182
intersects(cssa_ctx& ctx, Temp var, Temp parent)
183
{
184
   merge_node& node_var = ctx.merge_node_table[var.id()];
185
   merge_node& node_parent = ctx.merge_node_table[parent.id()];
186
   assert(node_var.index != node_parent.index);
187
   uint32_t block_idx = node_var.defined_at;
188

189
   /* if the parent is live-out at the definition block of var, they intersect */
190
   bool parent_live = ctx.live_out[block_idx].count(parent.id());
191
   if (parent_live)
192
      return true;
193

194
   /* parent is defined in a different block than var */
195
   if (node_parent.defined_at < node_var.defined_at) {
196
      /* if the parent is not live-in, they don't interfere */
197
      std::vector<uint32_t>& preds = var.type() == RegType::vgpr
198
                                        ? ctx.program->blocks[block_idx].logical_preds
199
                                        : ctx.program->blocks[block_idx].linear_preds;
200
      for (uint32_t pred : preds) {
201
         if (!ctx.live_out[pred].count(parent.id()))
202
            return false;
203
      }
204
   }
205

206
   for (const copy& cp : ctx.parallelcopies[block_idx]) {
207
      /* if var is defined at the edge, they don't intersect */
208
      if (cp.def.getTemp() == var)
209
         return false;
210
      if (cp.op.isTemp() && cp.op.getTemp() == parent)
211
         parent_live = true;
212
   }
213
   /* if the parent is live at the edge, they intersect */
214
   if (parent_live)
215
      return true;
216

217
   /* both, parent and var, are present in the same block */
218
   const Block& block = ctx.program->blocks[block_idx];
219
   for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
220
      /* if the parent was not encountered yet, it can only be used by a phi */
221
      if (is_phi(it->get()))
222
         break;
223

224
      for (const Definition& def : (*it)->definitions) {
225
         if (!def.isTemp())
226
            continue;
227
         /* if parent was not found yet, they don't intersect */
228
         if (def.getTemp() == var)
229
            return false;
230
      }
231

232
      for (const Operand& op : (*it)->operands) {
233
         if (!op.isTemp())
234
            continue;
235
         /* if the var was defined before this point, they intersect */
236
         if (op.getTemp() == parent)
237
            return true;
238
      }
239
   }
240

241
   return false;
242
}
243

244
/* check interference between var and parent:
245
 * i.e. they have different values and intersect.
246
 * If parent and var share the same value, also updates the equal ancestor. */
247
inline bool
248
interference(cssa_ctx& ctx, Temp var, Temp parent)
249
{
250
   assert(var != parent);
251
   merge_node& node_var = ctx.merge_node_table[var.id()];
252
   node_var.equal_anc_out = Temp();
253

254
   if (node_var.index == ctx.merge_node_table[parent.id()].index) {
255
      /* check/update in other set */
256
      parent = ctx.merge_node_table[parent.id()].equal_anc_out;
257
   }
258

259
   Temp tmp = parent;
260
   /* check if var intersects with parent or any equal-valued ancestor */
261
   while (tmp != Temp() && !intersects(ctx, var, tmp)) {
262
      merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
263
      tmp = node_tmp.equal_anc_in;
264
   }
265

266
   /* no intersection found */
267
   if (tmp == Temp())
268
      return false;
269

270
   /* var and parent, same value, but in different sets */
271
   if (node_var.value == ctx.merge_node_table[parent.id()].value) {
272
      node_var.equal_anc_out = tmp;
273
      return false;
274
   }
275

276
   /* var and parent, different values and intersect */
277
   return true;
278
}
279

280
/* tries to merge set_b into set_a of given temporary and
281
 * drops that temporary as it is being coalesced */
282
bool
283
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
284
{
285
   auto def_node_it = ctx.merge_node_table.find(dst.id());
286
   uint32_t index = def_node_it->second.index;
287
   merge_set& set_a = ctx.merge_sets[index];
288
   std::vector<Temp> dom; /* stack of the traversal */
289
   merge_set union_set;   /* the new merged merge-set */
290
   uint32_t i_a = 0;
291
   uint32_t i_b = 0;
292

293
   while (i_a < set_a.size() || i_b < set_b.size()) {
294
      Temp current;
295
      if (i_a == set_a.size())
296
         current = set_b[i_b++];
297
      else if (i_b == set_b.size())
298
         current = set_a[i_a++];
299
      /* else pick the one defined first */
300
      else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
301
         current = set_b[i_b++];
302
      else
303
         current = set_a[i_a++];
304

305
      while (!dom.empty() && !dominates(ctx, dom.back(), current))
306
         dom.pop_back(); /* not the desired parent, remove */
307

308
      if (!dom.empty() && interference(ctx, current, dom.back()))
309
         return false; /* intersection detected */
310

311
      dom.emplace_back(current); /* otherwise, keep checking */
312
      if (current != dst)
313
         union_set.emplace_back(current); /* maintain the new merge-set sorted */
314
   }
315

316
   /* update hashmap */
317
   for (Temp t : union_set) {
318
      merge_node& node = ctx.merge_node_table[t.id()];
319
      /* update the equal ancestors:
320
       * i.e. the 'closest' dominating def with the same value */
321
      Temp in = node.equal_anc_in;
322
      Temp out = node.equal_anc_out;
323
      if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
324
         node.equal_anc_in = out;
325
      node.equal_anc_out = Temp();
326
      /* update merge-set index */
327
      node.index = index;
328
   }
329
   set_b = merge_set(); /* free the old set_b */
330
   ctx.merge_sets[index] = union_set;
331
   ctx.merge_node_table.erase(dst.id()); /* remove the temporary */
332

333
   return true;
334
}
335

336
/* returns true if the copy can safely be omitted */
337
bool
338
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
339
{
340
   /* we can only coalesce temporaries */
341
   if (!copy.op.isTemp())
342
      return false;
343

344
   /* try emplace a merge_node for the copy operand */
345
   merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
346
   if (op_node.defined_at == -1u) {
347
      /* find defining block of operand */
348
      uint32_t pred = block_idx;
349
      do {
350
         block_idx = pred;
351
         pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
352
                                                           : ctx.program->blocks[pred].linear_idom;
353
      } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
354
      op_node.defined_at = block_idx;
355
      op_node.value = copy.op;
356
   }
357

358
   /* we can only coalesce copies of the same register class */
359
   if (copy.op.regClass() != copy.def.regClass())
360
      return false;
361

362
   /* check if this operand has not yet been coalesced */
363
   if (op_node.index == -1u) {
364
      merge_set op_set = merge_set{copy.op.getTemp()};
365
      return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
366
   }
367

368
   /* check if this operand has been coalesced into the same set */
369
   assert(ctx.merge_node_table.count(copy.def.tempId()));
370
   if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
371
      return true;
372

373
   /* otherwise, try to coalesce both merge sets */
374
   return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
375
}
376

377
/* node in the location-transfer-graph */
378
struct ltg_node {
379
   copy cp;
380
   uint32_t read_idx;
381
   uint32_t num_uses = 0;
382
};
383

384
/* emit the copies in an order that does not
385
 * create interferences within a merge-set */
386
void
387
emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
388
{
389
   auto&& it = ltg.begin();
390
   while (it != ltg.end()) {
391
      const copy& cp = it->second.cp;
392
      /* wrong regclass or still needed as operand */
393
      if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
394
         ++it;
395
         continue;
396
      }
397

398
      /* emit the copy */
399
      bld.copy(cp.def, it->second.cp.op);
400

401
      /* update the location transfer graph */
402
      if (it->second.read_idx != -1u) {
403
         auto&& other = ltg.find(it->second.read_idx);
404
         if (other != ltg.end())
405
            other->second.num_uses--;
406
      }
407
      ltg.erase(it);
408
      it = ltg.begin();
409
   }
410

411
   /* count the number of remaining circular dependencies */
412
   unsigned num = std::count_if(ltg.begin(), ltg.end(),
413
                                [&](auto& n) { return n.second.cp.def.regClass().type() == type; });
414

415
   /* if there are circular dependencies, we just emit them as single parallelcopy */
416
   if (num) {
417
      // TODO: this should be restricted to a feasible number of registers
418
      // and otherwise use a temporary to avoid having to reload more (spilled)
419
      // variables than we have registers.
420
      aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
421
         aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
422
      it = ltg.begin();
423
      for (unsigned i = 0; i < num; i++) {
424
         while (it->second.cp.def.regClass().type() != type)
425
            ++it;
426

427
         copy->definitions[i] = it->second.cp.def;
428
         copy->operands[i] = it->second.cp.op;
429
         it = ltg.erase(it);
430
      }
431
      bld.insert(std::move(copy));
432
   }
433
}
434

435
/* either emits or coalesces all parallelcopies and
436
 * renames the phi-operands accordingly. */
437
void
438
emit_parallelcopies(cssa_ctx& ctx)
439
{
440
   std::unordered_map<uint32_t, Operand> renames;
441

442
   /* we iterate backwards to prioritize coalescing in else-blocks */
443
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
444
      if (ctx.parallelcopies[i].empty())
445
         continue;
446

447
      std::map<uint32_t, ltg_node> ltg;
448
      /* first, try to coalesce all parallelcopies */
449
      for (const copy& cp : ctx.parallelcopies[i]) {
450
         if (try_coalesce_copy(ctx, cp, i)) {
451
            renames.emplace(cp.def.tempId(), cp.op);
452
            /* update liveness info */
453
            ctx.live_out[i].erase(cp.def.tempId());
454
            ctx.live_out[i].insert(cp.op.tempId());
455
         } else {
456
            uint32_t read_idx = -1u;
457
            if (cp.op.isTemp())
458
               read_idx = ctx.merge_node_table[cp.op.tempId()].index;
459
            uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
460
            assert(write_idx != -1u);
461
            ltg[write_idx] = {cp, read_idx};
462
         }
463
      }
464

465
      /* build location-transfer-graph */
466
      for (auto& pair : ltg) {
467
         if (pair.second.read_idx == -1u)
468
            continue;
469
         auto&& it = ltg.find(pair.second.read_idx);
470
         if (it != ltg.end())
471
            it->second.num_uses++;
472
      }
473

474
      /* emit parallelcopies ordered */
475
      Builder bld(ctx.program);
476
      Block& block = ctx.program->blocks[i];
477

478
      /* emit VGPR copies */
479
      auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
480
      { return inst->opcode == aco_opcode::p_logical_end; };
481
      auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
482
      bld.reset(&block.instructions, std::prev(it.base()));
483
      emit_copies_block(bld, ltg, RegType::vgpr);
484

485
      /* emit SGPR copies */
486
      aco_ptr<Instruction> branch = std::move(block.instructions.back());
487
      block.instructions.pop_back();
488
      bld.reset(&block.instructions);
489
      emit_copies_block(bld, ltg, RegType::sgpr);
490
      bld.insert(std::move(branch));
491
   }
492

493
   /* finally, rename coalesced phi operands */
494
   for (Block& block : ctx.program->blocks) {
495
      for (aco_ptr<Instruction>& phi : block.instructions) {
496
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
497
            break;
498

499
         for (Operand& op : phi->operands) {
500
            if (!op.isTemp())
501
               continue;
502
            auto&& it = renames.find(op.tempId());
503
            if (it != renames.end()) {
504
               op = it->second;
505
               renames.erase(it);
506
            }
507
         }
508
      }
509
   }
510
   assert(renames.empty());
511
}
512

513
} /* end namespace */
514

515
void
516
lower_to_cssa(Program* program, live& live_vars)
517
{
518
   reindex_ssa(program, live_vars.live_out);
519
   cssa_ctx ctx = {program, live_vars.live_out};
520
   collect_parallelcopies(ctx);
521
   emit_parallelcopies(ctx);
522

523
   /* update live variable information */
524
   live_vars = live_var_analysis(program);
525
}
526
} // namespace aco
527

528
Product

Resources

Company