GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_live_var_analysis.cpp
/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "util/u_math.h"

#include <set>
#include <vector>

namespace aco {

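/* Computes the net register-demand change caused by one instruction: newly
 * written (non-killed) definitions increase demand, first-kill operands
 * release it. */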
RegisterDemand
get_live_changes(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

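/* Registers needed only for the duration of the instruction itself:
 * killed definitions and first-kill late-kill operands. */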
RegisterDemand
get_temp_registers(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;

   for (Definition def : instr->definitions) {
      if (!def.isTemp())
         continue;
      if (def.isKill())
         temp_registers += def.getTemp();
   }

   for (Operand op : instr->operands) {
      if (op.isTemp() && op.isLateKill() && op.isFirstKill())
         temp_registers += op.getTemp();
   }

   return temp_registers;
}

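/* Given the register demand after an instruction, computes the demand before
 * it, including the temporaries of the preceding instruction (if any). */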
RegisterDemand
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                  aco_ptr<Instruction>& instr_before)
{
   demand -= get_live_changes(instr);
   demand -= get_temp_registers(instr);
   if (instr_before)
      demand += get_temp_registers(instr_before);
   return demand;
}

namespace {
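/* Backwards pass over one block: seeds liveness from the block's live-out
 * set, updates kill flags and per-instruction register demand, and merges
 * the resulting live-ins into the predecessors' live-out sets, growing the
 * worklist whenever a predecessor's set changes. */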
void
process_live_temps_per_block(Program* program, live& lives, Block* block,
                             unsigned& worklist, std::vector<uint16_t>& phi_sgpr_ops)
{
   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
   RegisterDemand new_demand;

   register_demand.resize(block->instructions.size());
   RegisterDemand block_register_demand;
   IDSet live = lives.live_out[block->index];

   /* initialize register demand */
   for (unsigned t : live)
      new_demand += Temp(t, program->temp_rc[t]);
   new_demand.sgpr -= phi_sgpr_ops[block->index];

   /* traverse the instructions backwards */
   int idx;
   for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
      Instruction* insn = block->instructions[idx].get();
      if (is_phi(insn))
         break;

      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr);

      /* KILL */
      for (Definition& definition : insn->definitions) {
         if (!definition.isTemp()) {
            continue;
         }
         if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
            program->needs_vcc = true;

         const Temp temp = definition.getTemp();
         const size_t n = live.erase(temp.id());

         if (n) {
            new_demand -= temp;
            definition.setKill(false);
         } else {
            register_demand[idx] += temp;
            definition.setKill(true);
         }
      }

      /* GEN */
      if (insn->opcode == aco_opcode::p_logical_end) {
         new_demand.sgpr += phi_sgpr_ops[block->index];
      } else {
         /* we need to do this in a separate loop because the next one can
          * setKill() for several operands at once and we don't want to
          * overwrite that in a later iteration */
         for (Operand& op : insn->operands)
            op.setKill(false);

         for (unsigned i = 0; i < insn->operands.size(); ++i) {
            Operand& operand = insn->operands[i];
            if (!operand.isTemp())
               continue;
            if (operand.isFixed() && operand.physReg() == vcc)
               program->needs_vcc = true;
            const Temp temp = operand.getTemp();
            const bool inserted = live.insert(temp.id()).second;
            if (inserted) {
               operand.setFirstKill(true);
               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
                  if (insn->operands[j].isTemp() &&
                      insn->operands[j].tempId() == operand.tempId()) {
                     insn->operands[j].setFirstKill(false);
                     insn->operands[j].setKill(true);
                  }
               }
               if (operand.isLateKill())
                  register_demand[idx] += temp;
               new_demand += temp;
            }
         }
      }

      block_register_demand.update(register_demand[idx]);
   }

   /* update the block's register demand one last time */
   block_register_demand.update(new_demand);
   if (program->progress < CompilationProgress::after_ra)
      block->register_demand = block_register_demand;

   /* handle phi definitions */
   int phi_idx = idx;
   while (phi_idx >= 0) {
      register_demand[phi_idx] = new_demand;
      Instruction* insn = block->instructions[phi_idx].get();

      assert(is_phi(insn) && insn->definitions.size() == 1);
      if (!insn->definitions[0].isTemp()) {
         assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);
         phi_idx--;
         continue;
      }
      Definition& definition = insn->definitions[0];
      if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
         program->needs_vcc = true;
      const Temp temp = definition.getTemp();
      const size_t n = live.erase(temp.id());

      if (n)
         definition.setKill(false);
      else
         definition.setKill(true);

      phi_idx--;
   }

   /* now, we need to merge the live-ins into the live-out sets */
   for (unsigned t : live) {
      RegClass rc = program->temp_rc[t];
      std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;

#ifndef NDEBUG
      if (preds.empty())
         aco_err(program, "Temporary never defined or defined after use: %%%d in BB%d", t,
                 block->index);
#endif

      for (unsigned pred_idx : preds) {
         auto it = lives.live_out[pred_idx].insert(t);
         if (it.second)
            worklist = std::max(worklist, pred_idx + 1);
      }
   }

   /* handle phi operands */
   phi_idx = idx;
   while (phi_idx >= 0) {
      Instruction* insn = block->instructions[phi_idx].get();
      assert(is_phi(insn));
      /* directly insert into the predecessors' live-out sets */
      std::vector<unsigned>& preds =
         insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
      for (unsigned i = 0; i < preds.size(); ++i) {
         Operand& operand = insn->operands[i];
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && operand.physReg() == vcc)
            program->needs_vcc = true;
         /* check if we changed an already processed block */
         const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
         if (inserted) {
            worklist = std::max(worklist, preds[i] + 1);
            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
               phi_sgpr_ops[preds[i]] += operand.size();
         }

         /* set if the operand is killed by this (or another) phi instruction */
         operand.setKill(!live.count(operand.tempId()));
      }
      phi_idx--;
   }

   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}

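/* Number of waves needed to execute one workgroup of this program. */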
unsigned
calc_waves_per_workgroup(Program* program)
{
   /* When the workgroup size is not known, just go with wave_size */
   unsigned workgroup_size =
      program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;

   return align(workgroup_size, program->wave_size) / program->wave_size;
}
} /* end namespace */

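/* SGPRs the hardware reserves on top of the shader's own demand
 * (VCC, flat scratch, XNACK state), depending on the generation. */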
uint16_t
get_extra_sgprs(Program* program)
{
   if (program->chip_class >= GFX10) {
      assert(!program->needs_flat_scr);
      assert(!program->dev.xnack_enabled);
      return 0;
   } else if (program->chip_class >= GFX8) {
      if (program->needs_flat_scr)
         return 6;
      else if (program->dev.xnack_enabled)
         return 4;
      else if (program->needs_vcc)
         return 2;
      else
         return 0;
   } else {
      assert(!program->dev.xnack_enabled);
      if (program->needs_flat_scr)
         return 4;
      else if (program->needs_vcc)
         return 2;
      else
         return 0;
   }
}

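/* Rounds an SGPR demand (plus reserved SGPRs) up to the allocation granule. */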
uint16_t
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
{
   uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
   uint16_t granule = program->dev.sgpr_alloc_granule;
   return ALIGN_NPOT(std::max(sgprs, granule), granule);
}

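/* Rounds a VGPR demand up to the allocation granule. */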
uint16_t
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
{
   assert(addressable_vgprs <= program->dev.vgpr_limit);
   uint16_t granule = program->dev.vgpr_alloc_granule;
   return align(std::max(addressable_vgprs, granule), granule);
}

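/* Rounds a down to the nearest multiple of b. */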
unsigned
round_down(unsigned a, unsigned b)
{
   return a - (a % b);
}

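/* Maximum number of addressable SGPRs per wave at the given wave count. */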
uint16_t
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
{
   /* it's not possible to allocate more than 128 SGPRs */
   uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
   sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
   sgprs -= get_extra_sgprs(program);
   return std::min(sgprs, program->dev.sgpr_limit);
}

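/* Maximum number of addressable VGPRs per wave at the given wave count. */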
uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
   uint16_t vgprs = (program->dev.physical_vgprs / waves) & ~(program->dev.vgpr_alloc_granule - 1);
   vgprs -= program->config->num_shared_vgprs / 2;
   return std::min(vgprs, program->dev.vgpr_limit);
}

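/* Minimum number of waves per SIMD needed to fit one workgroup on a CU/WGP. */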
void
calc_min_waves(Program* program)
{
   unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
   program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}

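/* Recomputes the wave count and maximum register demand from the program's
 * peak demand, taking workgroup size and LDS usage into account. */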
void
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
   unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
   unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;

   assert(program->min_waves >= 1);
   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
   uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

   /* this won't compile: register pressure reduction is necessary */
   if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      uint16_t vgpr_demand =
         get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
      program->num_waves =
         std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
      program->max_waves = max_waves_per_simd;

      /* adjust max_waves for workgroup and LDS limits */
      unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
      if (program->config->lds_size) {
         unsigned lds = program->config->lds_size * program->dev.lds_encoding_granule;
         lds = align(lds, program->dev.lds_alloc_granule);
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
      }
      if (waves_per_workgroup > 1 && program->chip_class < GFX10)
         workgroups_per_cu_wgp = std::min(
            workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */

      /* in cases like waves_per_workgroup=3, or lds=65536 and
       * waves_per_workgroup=1, we want the maximum possible number of waves
       * per SIMD and not the minimum, so DIV_ROUND_UP is used */
      program->max_waves = std::min<uint16_t>(
         program->max_waves,
         DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));

      /* incorporate max_waves and calculate max_reg_demand */
      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}

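/* Entry point: iterates backwards over all blocks, re-queueing predecessors
 * whose live-out sets change, until the live sets reach a fixed point. */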
live
live_var_analysis(Program* program)
{
   live result;
   result.live_out.resize(program->blocks.size());
   result.register_demand.resize(program->blocks.size());
   unsigned worklist = program->blocks.size();
   std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
   RegisterDemand new_demand;

   program->needs_vcc = false;

   /* this implementation assumes that the block idx corresponds to the block's position in
    * the program->blocks vector */
   while (worklist) {
      unsigned block_idx = --worklist;
      process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
                                   phi_sgpr_ops);
      new_demand.update(program->blocks[block_idx].register_demand);
   }

   /* calculate the program's register demand and number of waves */
   if (program->progress < CompilationProgress::after_ra)
      update_vgpr_sgpr_demand(program, new_demand);

   return result;
}

} // namespace aco