CoCalc -- aco_live_var

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_live_var_analysis.cpp
⁴⁵⁵⁰ views
1
/*
2
 * Copyright © 2018 Valve Corporation
3
 * Copyright © 2018 Google
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
 * IN THE SOFTWARE.
23
 *
24
 */
25

26
#include "aco_ir.h"
27

28
#include "util/u_math.h"
29

30
#include <set>
31
#include <vector>
32

33
namespace aco {
34
RegisterDemand
35
get_live_changes(aco_ptr<Instruction>& instr)
36
{
37
   RegisterDemand changes;
38
   for (const Definition& def : instr->definitions) {
39
      if (!def.isTemp() || def.isKill())
40
         continue;
41
      changes += def.getTemp();
42
   }
43

44
   for (const Operand& op : instr->operands) {
45
      if (!op.isTemp() || !op.isFirstKill())
46
         continue;
47
      changes -= op.getTemp();
48
   }
49

50
   return changes;
51
}
52

53
RegisterDemand
54
get_temp_registers(aco_ptr<Instruction>& instr)
55
{
56
   RegisterDemand temp_registers;
57

58
   for (Definition def : instr->definitions) {
59
      if (!def.isTemp())
60
         continue;
61
      if (def.isKill())
62
         temp_registers += def.getTemp();
63
   }
64

65
   for (Operand op : instr->operands) {
66
      if (op.isTemp() && op.isLateKill() && op.isFirstKill())
67
         temp_registers += op.getTemp();
68
   }
69

70
   return temp_registers;
71
}
72

73
RegisterDemand
74
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
75
                  aco_ptr<Instruction>& instr_before)
76
{
77
   demand -= get_live_changes(instr);
78
   demand -= get_temp_registers(instr);
79
   if (instr_before)
80
      demand += get_temp_registers(instr_before);
81
   return demand;
82
}
83

84
namespace {
85
void
86
process_live_temps_per_block(Program* program, live& lives, Block* block,
87
                             unsigned& worklist, std::vector<uint16_t>& phi_sgpr_ops)
88
{
89
   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
90
   RegisterDemand new_demand;
91

92
   register_demand.resize(block->instructions.size());
93
   RegisterDemand block_register_demand;
94
   IDSet live = lives.live_out[block->index];
95

96
   /* initialize register demand */
97
   for (unsigned t : live)
98
      new_demand += Temp(t, program->temp_rc[t]);
99
   new_demand.sgpr -= phi_sgpr_ops[block->index];
100

101
   /* traverse the instructions backwards */
102
   int idx;
103
   for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
104
      Instruction* insn = block->instructions[idx].get();
105
      if (is_phi(insn))
106
         break;
107

108
      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr);
109

110
      /* KILL */
111
      for (Definition& definition : insn->definitions) {
112
         if (!definition.isTemp()) {
113
            continue;
114
         }
115
         if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
116
            program->needs_vcc = true;
117

118
         const Temp temp = definition.getTemp();
119
         const size_t n = live.erase(temp.id());
120

121
         if (n) {
122
            new_demand -= temp;
123
            definition.setKill(false);
124
         } else {
125
            register_demand[idx] += temp;
126
            definition.setKill(true);
127
         }
128
      }
129

130
      /* GEN */
131
      if (insn->opcode == aco_opcode::p_logical_end) {
132
         new_demand.sgpr += phi_sgpr_ops[block->index];
133
      } else {
134
         /* we need to do this in a separate loop because the next one can
135
          * setKill() for several operands at once and we don't want to
136
          * overwrite that in a later iteration */
137
         for (Operand& op : insn->operands)
138
            op.setKill(false);
139

140
         for (unsigned i = 0; i < insn->operands.size(); ++i) {
141
            Operand& operand = insn->operands[i];
142
            if (!operand.isTemp())
143
               continue;
144
            if (operand.isFixed() && operand.physReg() == vcc)
145
               program->needs_vcc = true;
146
            const Temp temp = operand.getTemp();
147
            const bool inserted = live.insert(temp.id()).second;
148
            if (inserted) {
149
               operand.setFirstKill(true);
150
               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
151
                  if (insn->operands[j].isTemp() &&
152
                      insn->operands[j].tempId() == operand.tempId()) {
153
                     insn->operands[j].setFirstKill(false);
154
                     insn->operands[j].setKill(true);
155
                  }
156
               }
157
               if (operand.isLateKill())
158
                  register_demand[idx] += temp;
159
               new_demand += temp;
160
            }
161
         }
162
      }
163

164
      block_register_demand.update(register_demand[idx]);
165
   }
166

167
   /* update block's register demand for a last time */
168
   block_register_demand.update(new_demand);
169
   if (program->progress < CompilationProgress::after_ra)
170
      block->register_demand = block_register_demand;
171

172
   /* handle phi definitions */
173
   int phi_idx = idx;
174
   while (phi_idx >= 0) {
175
      register_demand[phi_idx] = new_demand;
176
      Instruction* insn = block->instructions[phi_idx].get();
177

178
      assert(is_phi(insn) && insn->definitions.size() == 1);
179
      if (!insn->definitions[0].isTemp()) {
180
         assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
181
         phi_idx--;
182
         continue;
183
      }
184
      Definition& definition = insn->definitions[0];
185
      if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
186
         program->needs_vcc = true;
187
      const Temp temp = definition.getTemp();
188
      const size_t n = live.erase(temp.id());
189

190
      if (n)
191
         definition.setKill(false);
192
      else
193
         definition.setKill(true);
194

195
      phi_idx--;
196
   }
197

198
   /* now, we need to merge the live-ins into the live-out sets */
199
   for (unsigned t : live) {
200
      RegClass rc = program->temp_rc[t];
201
      std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
202

203
#ifndef NDEBUG
204
      if (preds.empty())
205
         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t,
206
                 block->index);
207
#endif
208

209
      for (unsigned pred_idx : preds) {
210
         auto it = lives.live_out[pred_idx].insert(t);
211
         if (it.second)
212
            worklist = std::max(worklist, pred_idx + 1);
213
      }
214
   }
215

216
   /* handle phi operands */
217
   phi_idx = idx;
218
   while (phi_idx >= 0) {
219
      Instruction* insn = block->instructions[phi_idx].get();
220
      assert(is_phi(insn));
221
      /* directly insert into the predecessors live-out set */
222
      std::vector<unsigned>& preds =
223
         insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
224
      for (unsigned i = 0; i < preds.size(); ++i) {
225
         Operand& operand = insn->operands[i];
226
         if (!operand.isTemp())
227
            continue;
228
         if (operand.isFixed() && operand.physReg() == vcc)
229
            program->needs_vcc = true;
230
         /* check if we changed an already processed block */
231
         const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
232
         if (inserted) {
233
            worklist = std::max(worklist, preds[i] + 1);
234
            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
235
               phi_sgpr_ops[preds[i]] += operand.size();
236
         }
237

238
         /* set if the operand is killed by this (or another) phi instruction */
239
         operand.setKill(!live.count(operand.tempId()));
240
      }
241
      phi_idx--;
242
   }
243

244
   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
245
}
246

247
unsigned
248
calc_waves_per_workgroup(Program* program)
249
{
250
   /* When workgroup size is not known, just go with wave_size */
251
   unsigned workgroup_size =
252
      program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
253

254
   return align(workgroup_size, program->wave_size) / program->wave_size;
255
}
256
} /* end namespace */
257

258
uint16_t
259
get_extra_sgprs(Program* program)
260
{
261
   if (program->chip_class >= GFX10) {
262
      assert(!program->needs_flat_scr);
263
      assert(!program->dev.xnack_enabled);
264
      return 0;
265
   } else if (program->chip_class >= GFX8) {
266
      if (program->needs_flat_scr)
267
         return 6;
268
      else if (program->dev.xnack_enabled)
269
         return 4;
270
      else if (program->needs_vcc)
271
         return 2;
272
      else
273
         return 0;
274
   } else {
275
      assert(!program->dev.xnack_enabled);
276
      if (program->needs_flat_scr)
277
         return 4;
278
      else if (program->needs_vcc)
279
         return 2;
280
      else
281
         return 0;
282
   }
283
}
284

285
uint16_t
286
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
287
{
288
   uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
289
   uint16_t granule = program->dev.sgpr_alloc_granule;
290
   return ALIGN_NPOT(std::max(sgprs, granule), granule);
291
}
292

293
uint16_t
294
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
295
{
296
   assert(addressable_vgprs <= program->dev.vgpr_limit);
297
   uint16_t granule = program->dev.vgpr_alloc_granule;
298
   return align(std::max(addressable_vgprs, granule), granule);
299
}
300

301
unsigned
302
round_down(unsigned a, unsigned b)
303
{
304
   return a - (a % b);
305
}
306

307
uint16_t
308
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
309
{
310
   /* it's not possible to allocate more than 128 SGPRs */
311
   uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
312
   sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
313
   sgprs -= get_extra_sgprs(program);
314
   return std::min(sgprs, program->dev.sgpr_limit);
315
}
316

317
uint16_t
318
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
319
{
320
   uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
321
   vgprs -= program->config->num_shared_vgprs / 2;
322
   return std::min(vgprs, program->dev.vgpr_limit);
323
}
324

325
void
326
calc_min_waves(Program* program)
327
{
328
   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
329
   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
330
   program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
331
}
332

333
void
334
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
335
{
336
   unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
337
   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
338
   unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
339

340
   assert(program->min_waves >= 1);
341
   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
342
   uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
343

344
   /* this won't compile, register pressure reduction necessary */
345
   if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
346
      program->num_waves = 0;
347
      program->max_reg_demand = new_demand;
348
   } else {
349
      program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
350
      uint16_t vgpr_demand =
351
         get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
352
      program->num_waves =
353
         std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
354
      program->max_waves = max_waves_per_simd;
355

356
      /* adjust max_waves for workgroup and LDS limits */
357
      unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
358
      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
359
      if (program->config->lds_size) {
360
         unsigned lds = program->config->lds_size * program->dev.lds_encoding_granule;
361
         lds = align(lds, program->dev.lds_alloc_granule);
362
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
363
      }
364
      if (waves_per_workgroup > 1 && program->chip_class < GFX10)
365
         workgroups_per_cu_wgp = std::min(
366
            workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
367

368
      /* in cases like waves_per_workgroup=3 or lds=65536 and
369
       * waves_per_workgroup=1, we want the maximum possible number of waves per
370
       * SIMD and not the minimum. so DIV_ROUND_UP is used */
371
      program->max_waves = std::min<uint16_t>(
372
         program->max_waves,
373
         DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
374

375
      /* incorporate max_waves and calculate max_reg_demand */
376
      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
377
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
378
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
379
   }
380
}
381

382
live
383
live_var_analysis(Program* program)
384
{
385
   live result;
386
   result.live_out.resize(program->blocks.size());
387
   result.register_demand.resize(program->blocks.size());
388
   unsigned worklist = program->blocks.size();
389
   std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
390
   RegisterDemand new_demand;
391

392
   program->needs_vcc = false;
393

394
   /* this implementation assumes that the block idx corresponds to the block's position in
395
    * program->blocks vector */
396
   while (worklist) {
397
      unsigned block_idx = --worklist;
398
      process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
399
                                   phi_sgpr_ops);
400
      new_demand.update(program->blocks[block_idx].register_demand);
401
   }
402

403
   /* calculate the program's register demand and number of waves */
404
   if (program->progress < CompilationProgress::after_ra)
405
      update_vgpr_sgpr_demand(program, new_demand);
406

407
   return result;
408
}
409

410
} // namespace aco
411

412
Product

Resources

Company