GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/aco_instruction_selection_setup.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "common/ac_exp_param.h"
#include "common/sid.h"
#include "vulkan/radv_descriptor_set.h"

#include "nir_control_flow.h"

#include <vector>

namespace aco {

namespace {

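/* Map a NIR barycentric load intrinsic and its interpolation mode to the
 * SPI_PS_INPUT_ENA bit that requests the matching barycentric coordinates
 * from the hardware.
 */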
unsigned
get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
{
   switch (interp) {
   case INTERP_MODE_SMOOTH:
   case INTERP_MODE_NONE:
      if (intrin == nir_intrinsic_load_barycentric_pixel ||
          intrin == nir_intrinsic_load_barycentric_at_sample ||
          intrin == nir_intrinsic_load_barycentric_at_offset)
         return S_0286CC_PERSP_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_PERSP_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_PERSP_SAMPLE_ENA(1);
      break;
   case INTERP_MODE_NOPERSPECTIVE:
      if (intrin == nir_intrinsic_load_barycentric_pixel)
         return S_0286CC_LINEAR_CENTER_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_centroid)
         return S_0286CC_LINEAR_CENTROID_ENA(1);
      else if (intrin == nir_intrinsic_load_barycentric_sample)
         return S_0286CC_LINEAR_SAMPLE_ENA(1);
      break;
   default: break;
   }
   return 0;
}

bool
is_loop_header_block(nir_block* block)
{
   return block->cf_node.parent->type == nir_cf_node_loop &&
          block == nir_loop_first_block(nir_cf_node_as_loop(block->cf_node.parent));
}

/* similar to nir_block_is_unreachable(), but does not require dominance information */
bool
is_block_reachable(nir_function_impl* impl, nir_block* known_reachable, nir_block* block)
{
   if (block == nir_start_block(impl) || block == known_reachable)
      return true;

   /* skip loop back-edges */
   if (is_loop_header_block(block)) {
      nir_loop* loop = nir_cf_node_as_loop(block->cf_node.parent);
      nir_block* preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop));
      return is_block_reachable(impl, known_reachable, preheader);
   }

   set_foreach (block->predecessors, entry) {
      if (is_block_reachable(impl, known_reachable, (nir_block*)entry->key))
         return true;
   }

   return false;
}

/* Check whether the given SSA def is only used by cross-lane instructions. */
bool
only_used_by_cross_lane_instrs(nir_ssa_def* ssa, bool follow_phis = true)
{
   nir_foreach_use (src, ssa) {
      switch (src->parent_instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr* alu = nir_instr_as_alu(src->parent_instr);
         if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y)
            return false;
         if (!only_used_by_cross_lane_instrs(&alu->dest.dest.ssa, follow_phis))
            return false;

         continue;
      }
      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(src->parent_instr);
         if (intrin->intrinsic != nir_intrinsic_read_invocation &&
             intrin->intrinsic != nir_intrinsic_read_first_invocation &&
             intrin->intrinsic != nir_intrinsic_lane_permute_16_amd)
            return false;

         continue;
      }
      case nir_instr_type_phi: {
         /* Don't follow more than 1 phi, to avoid infinite loops. */
         if (!follow_phis)
            return false;

         nir_phi_instr* phi = nir_instr_as_phi(src->parent_instr);
         if (!only_used_by_cross_lane_instrs(&phi->dest.ssa, false))
            return false;

         continue;
      }
      default: return false;
      }
   }

   return true;
}

/* If one side of a divergent IF ends in a branch and the other doesn't, we
 * might have to emit the contents of the side without the branch at the merge
 * block instead. This is so that we can use any SGPR live-out of the side
 * without the branch without creating a linear phi in the invert or merge block. */
bool
sanitize_if(nir_function_impl* impl, nir_if* nif)
{
   // TODO: skip this if the condition is uniform and there are no divergent breaks/continues?

   nir_block* then_block = nir_if_last_then_block(nif);
   nir_block* else_block = nir_if_last_else_block(nif);
   bool then_jump = nir_block_ends_in_jump(then_block) ||
                    !is_block_reachable(impl, nir_if_first_then_block(nif), then_block);
   bool else_jump = nir_block_ends_in_jump(else_block) ||
                    !is_block_reachable(impl, nir_if_first_else_block(nif), else_block);
   if (then_jump == else_jump)
      return false;

   /* If the continue-from block is empty then return as there is nothing to
    * move.
    */
   if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list))
      return false;

   /* Even though this if statement has a jump on one side, we may still have
    * phis afterwards. Single-source phis can be produced by loop unrolling
    * or dead control-flow passes and are perfectly legal. Run a quick phi
    * removal on the block after the if to clean up any such phis.
    */
   nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));

   /* Finally, move the continue-from branch after the if-statement. */
   nir_block* last_continue_from_blk = else_jump ? then_block : else_block;
   nir_block* first_continue_from_blk =
      else_jump ? nir_if_first_then_block(nif) : nir_if_first_else_block(nif);

   nir_cf_list tmp;
   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
                  nir_after_block(last_continue_from_blk));
   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));

   return true;
}

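/* Recursively walk a control-flow list and apply sanitize_if() to every if,
 * so that ifs with a jump on only one side are rewritten before instruction
 * selection.
 */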
bool
sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list)
{
   bool progress = false;
   foreach_list_typed (nir_cf_node, cf_node, node, cf_list) {
      switch (cf_node->type) {
      case nir_cf_node_block: break;
      case nir_cf_node_if: {
         nir_if* nif = nir_cf_node_as_if(cf_node);
         progress |= sanitize_cf_list(impl, &nif->then_list);
         progress |= sanitize_cf_list(impl, &nif->else_list);
         progress |= sanitize_if(impl, nif);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop* loop = nir_cf_node_as_loop(cf_node);
         progress |= sanitize_cf_list(impl, &loop->body);
         break;
      }
      case nir_cf_node_function: unreachable("Invalid cf type");
      }
   }

   return progress;
}

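/* If the given SSA def is an iadd, use NIR's unsigned upper-bound analysis to
 * check whether the addition can wrap around 32 bits; if it cannot, set the
 * no_unsigned_wrap flag on the instruction.
 */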
void
apply_nuw_to_ssa(isel_context* ctx, nir_ssa_def* ssa)
{
   nir_ssa_scalar scalar;
   scalar.def = ssa;
   scalar.comp = 0;

   if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != nir_op_iadd)
      return;

   nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr);

   if (add->no_unsigned_wrap)
      return;

   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);

   if (nir_ssa_scalar_is_const(src0)) {
      nir_ssa_scalar tmp = src0;
      src0 = src1;
      src1 = tmp;
   }

   uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config);
   add->no_unsigned_wrap =
      !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config);
}

void
apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl)
{
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;
         nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_load_constant:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_push_constant:
            if (!nir_src_is_divergent(intrin->src[0]))
               apply_nuw_to_ssa(ctx, intrin->src[0].ssa);
            break;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
            if (!nir_src_is_divergent(intrin->src[1]))
               apply_nuw_to_ssa(ctx, intrin->src[1].ssa);
            break;
         case nir_intrinsic_store_ssbo:
            if (!nir_src_is_divergent(intrin->src[2]))
               apply_nuw_to_ssa(ctx, intrin->src[2].ssa);
            break;
         default: break;
         }
      }
   }
}

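/* Compute the register class for an SSA def: 1-bit values are lane masks and
 * always live in SGPRs (one lane-mask worth of SGPRs per component); other
 * values get a class of the requested type sized from components * bitsize.
 */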
RegClass
get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize)
{
   if (bitsize == 1)
      return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components);
   else
      return RegClass::get(type, components * bitsize / 8u);
}

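/* Fill radv_vs_output_info for a VS-like stage: assign param-export slots to
 * the written varyings, count how many position exports are needed (position,
 * misc, clip/cull distances), record clip/cull distance usage in the context
 * and decide whether GFX10+ early rasterization can be used.
 */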
void
setup_vs_output_info(isel_context* ctx, nir_shader* nir, bool export_prim_id,
                     bool export_clip_dists, radv_vs_output_info* outinfo)
{
   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
          sizeof(outinfo->vs_output_param_offset));

   outinfo->param_exports = 0;
   int pos_written = 0x1;
   bool writes_primitive_shading_rate =
      outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
   if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer ||
       writes_primitive_shading_rate)
      pos_written |= 1 << 1;

   uint64_t mask = nir->info.outputs_written;
   while (mask) {
      int idx = u_bit_scan64(&mask);
      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER ||
          idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT ||
          ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) &&
           export_clip_dists)) {
         if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
            outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
      }
   }
   if (outinfo->writes_layer &&
       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
      /* when ctx->options->key.has_multiview_view_index = true, the layer
       * variable isn't declared in NIR and it's isel's job to get the layer */
      outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
   }

   if (export_prim_id) {
      assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
      outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
   }

   ctx->export_clip_dists = export_clip_dists;
   ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
   ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);

   assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);

   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
      pos_written |= 1 << 2;
   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
      pos_written |= 1 << 3;

   outinfo->pos_exports = util_bitcount(pos_written);

   /* GFX10+ early rasterization:
    * When there are no param exports in an NGG (or legacy VS) shader,
    * RADV sets NO_PC_EXPORT=1, which means the HW will start clipping and rasterization
    * as soon as it encounters a DONE pos export. When this happens, PS waves can launch
    * before the NGG (or VS) waves finish.
    */
   ctx->program->early_rast = ctx->program->chip_class >= GFX10 && outinfo->param_exports == 0;
}

void
setup_vs_variables(isel_context* ctx, nir_shader* nir)
{
   if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);
   }

   if (ctx->stage == vertex_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
             (32 * 1024));
   }
}

void
setup_gs_variables(isel_context* ctx, nir_shader* nir)
{
   if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
      ctx->program->config->lds_size =
         ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */
   } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo;
      setup_vs_output_info(ctx, nir, false, ctx->options->key.vs_common_out.export_clip_dists,
                           outinfo);

      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
   }

   if (ctx->stage.has(SWStage::VS))
      ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
   else if (ctx->stage.has(SWStage::TES))
      ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
}

void
setup_tcs_info(isel_context* ctx, nir_shader* nir, nir_shader* vs)
{
   ctx->tcs_in_out_eq = ctx->args->shader_info->vs.tcs_in_out_eq;
   ctx->tcs_temp_only_inputs = ctx->args->shader_info->vs.tcs_temp_only_input_mask;
   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;
   ctx->program->config->lds_size = ctx->args->shader_info->tcs.num_lds_blocks;
}

void
setup_tes_variables(isel_context* ctx, nir_shader* nir)
{
   ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches;

   if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) {
      radv_vs_output_info* outinfo = &ctx->program->info->tes.outinfo;
      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);

      /* TODO: NGG streamout */
      if (ctx->stage.hw == HWStage::NGG)
         assert(!ctx->args->shader_info->so.num_outputs);
   }

   if (ctx->stage == tess_eval_ngg) {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <
             (32 * 1024));
   }
}

void
setup_variables(isel_context* ctx, nir_shader* nir)
{
   switch (nir->info.stage) {
   case MESA_SHADER_FRAGMENT: {
      break;
   }
   case MESA_SHADER_COMPUTE: {
      ctx->program->config->lds_size =
         DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
      break;
   }
   case MESA_SHADER_VERTEX: {
      setup_vs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_GEOMETRY: {
      setup_gs_variables(ctx, nir);
      break;
   }
   case MESA_SHADER_TESS_CTRL: {
      break;
   }
   case MESA_SHADER_TESS_EVAL: {
      setup_tes_variables(ctx, nir);
      break;
   }
   default: unreachable("Unhandled shader stage.");
   }

   /* Make sure we fit the available LDS space. */
   assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <=
          ctx->program->dev.lds_limit);
}

void
setup_nir(isel_context* ctx, nir_shader* nir)
{
   /* the variable setup has to be done before lower_io / CSE */
   setup_variables(ctx, nir);

   nir_convert_to_lcssa(nir, true, false);
   nir_lower_phis_to_scalar(nir, true);

   nir_function_impl* func = nir_shader_get_entrypoint(nir);
   nir_index_ssa_defs(func);
}

} /* end namespace */

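/* Pre-pass over the NIR shader before instruction selection: set up NIR range
 * analysis, sanitize control flow, assign a register class to every SSA def,
 * collect SPI_PS_INPUT_ENA/ADDR flags for fragment shaders and append the
 * shader's constant data to the program.
 */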
void
init_context(isel_context* ctx, nir_shader* shader)
{
   nir_function_impl* impl = nir_shader_get_entrypoint(shader);
   ctx->shader = shader;

   /* Init NIR range analysis. */
   ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
   ctx->ub_config.min_subgroup_size = 64;
   ctx->ub_config.max_subgroup_size = 64;
   if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) {
      ctx->ub_config.min_subgroup_size = ctx->options->key.cs.subgroup_size;
      ctx->ub_config.max_subgroup_size = ctx->options->key.cs.subgroup_size;
   }
   ctx->ub_config.max_workgroup_invocations = 2048;
   ctx->ub_config.max_workgroup_count[0] = 65535;
   ctx->ub_config.max_workgroup_count[1] = 65535;
   ctx->ub_config.max_workgroup_count[2] = 65535;
   ctx->ub_config.max_workgroup_size[0] = 2048;
   ctx->ub_config.max_workgroup_size[1] = 2048;
   ctx->ub_config.max_workgroup_size[2] = 2048;
   for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
      unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[i];
      unsigned dfmt = attrib_format & 0xf;
      unsigned nfmt = (attrib_format >> 4) & 0x7;

      uint32_t max = UINT32_MAX;
      if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
         max = 0x3f800000u;
      } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) {
         bool uscaled = nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED;
         switch (dfmt) {
         case V_008F0C_BUF_DATA_FORMAT_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8:
         case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: max = uscaled ? 0x437f0000u : UINT8_MAX; break;
         case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
         case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: max = uscaled ? 0x447fc000u : 1023; break;
         case V_008F0C_BUF_DATA_FORMAT_10_11_11:
         case V_008F0C_BUF_DATA_FORMAT_11_11_10: max = uscaled ? 0x44ffe000u : 2047; break;
         case V_008F0C_BUF_DATA_FORMAT_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16:
         case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: max = uscaled ? 0x477fff00u : UINT16_MAX; break;
         case V_008F0C_BUF_DATA_FORMAT_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32:
         case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: max = uscaled ? 0x4f800000u : UINT32_MAX; break;
         }
      }
      ctx->ub_config.vertex_attrib_max[i] = max;
   }

   nir_divergence_analysis(shader);
   nir_opt_uniform_atomics(shader);

   apply_nuw_to_offsets(ctx, impl);

   /* sanitize control flow */
   sanitize_cf_list(impl, &impl->body);
   nir_metadata_preserve(impl, nir_metadata_none);

   /* we'll need these for isel */
   nir_metadata_require(impl, nir_metadata_block_index);

   if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
      fprintf(stderr, "NIR shader before instruction selection:\n");
      nir_print_shader(shader, stderr);
   }

   ctx->first_temp_id = ctx->program->peekAllocationId();
   ctx->program->allocateRange(impl->ssa_alloc);
   RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;

   unsigned spi_ps_inputs = 0;

   std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};

   /* TODO: make this recursive to improve compile times */
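   /* Assign a register class (SGPR or VGPR, and size) to every SSA def.
    * Phi classes depend on their sources, which may be visited later, so
    * repeat until no phi's register class changes anymore.
    */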
   bool done = false;
   while (!done) {
      done = true;
      nir_foreach_block (block, impl) {
         nir_foreach_instr (instr, block) {
            switch (instr->type) {
            case nir_instr_type_alu: {
               nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
               RegType type =
                  nir_dest_is_divergent(alu_instr->dest.dest) ? RegType::vgpr : RegType::sgpr;
               switch (alu_instr->op) {
               case nir_op_fmul:
               case nir_op_fadd:
               case nir_op_fsub:
               case nir_op_fmax:
               case nir_op_fmin:
               case nir_op_fneg:
               case nir_op_fabs:
               case nir_op_fsat:
               case nir_op_fsign:
               case nir_op_frcp:
               case nir_op_frsq:
               case nir_op_fsqrt:
               case nir_op_fexp2:
               case nir_op_flog2:
               case nir_op_ffract:
               case nir_op_ffloor:
               case nir_op_fceil:
               case nir_op_ftrunc:
               case nir_op_fround_even:
               case nir_op_fsin:
               case nir_op_fcos:
               case nir_op_f2f16:
               case nir_op_f2f16_rtz:
               case nir_op_f2f16_rtne:
               case nir_op_f2f32:
               case nir_op_f2f64:
               case nir_op_u2f16:
               case nir_op_u2f32:
               case nir_op_u2f64:
               case nir_op_i2f16:
               case nir_op_i2f32:
               case nir_op_i2f64:
               case nir_op_pack_half_2x16_split:
               case nir_op_unpack_half_2x16_split_x:
               case nir_op_unpack_half_2x16_split_y:
               case nir_op_fddx:
               case nir_op_fddy:
               case nir_op_fddx_fine:
               case nir_op_fddy_fine:
               case nir_op_fddx_coarse:
               case nir_op_fddy_coarse:
               case nir_op_fquantize2f16:
               case nir_op_ldexp:
               case nir_op_frexp_sig:
               case nir_op_frexp_exp:
               case nir_op_cube_face_index_amd:
               case nir_op_cube_face_coord_amd:
               case nir_op_sad_u8x4: type = RegType::vgpr; break;
               case nir_op_f2i16:
               case nir_op_f2u16:
               case nir_op_f2i32:
               case nir_op_f2u32:
               case nir_op_f2i64:
               case nir_op_f2u64:
               case nir_op_b2i8:
               case nir_op_b2i16:
               case nir_op_b2i32:
               case nir_op_b2i64:
               case nir_op_b2b32:
               case nir_op_b2f16:
               case nir_op_b2f32:
               case nir_op_mov: break;
               case nir_op_iadd:
               case nir_op_isub:
               case nir_op_imul:
               case nir_op_imin:
               case nir_op_imax:
               case nir_op_umin:
               case nir_op_umax:
               case nir_op_ishl:
               case nir_op_ishr:
               case nir_op_ushr:
                  /* packed 16bit instructions have to be VGPR */
                  type = alu_instr->dest.dest.ssa.num_components == 2 ? RegType::vgpr : type;
                  FALLTHROUGH;
               default:
                  for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
                     if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }

               RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components,
                                           alu_instr->dest.dest.ssa.bit_size);
               regclasses[alu_instr->dest.dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_load_const: {
               unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_intrinsic: {
               nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
               if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
                  break;
               RegType type = RegType::sgpr;
               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_push_constant:
               case nir_intrinsic_load_workgroup_id:
               case nir_intrinsic_load_num_workgroups:
               case nir_intrinsic_load_subgroup_id:
               case nir_intrinsic_load_num_subgroups:
               case nir_intrinsic_load_first_vertex:
               case nir_intrinsic_load_base_instance:
               case nir_intrinsic_vote_all:
               case nir_intrinsic_vote_any:
               case nir_intrinsic_read_first_invocation:
               case nir_intrinsic_read_invocation:
               case nir_intrinsic_first_invocation:
               case nir_intrinsic_ballot:
               case nir_intrinsic_load_ring_tess_factors_amd:
               case nir_intrinsic_load_ring_tess_factors_offset_amd:
               case nir_intrinsic_load_ring_tess_offchip_amd:
               case nir_intrinsic_load_ring_tess_offchip_offset_amd:
               case nir_intrinsic_load_ring_esgs_amd:
               case nir_intrinsic_load_ring_es2gs_offset_amd:
               case nir_intrinsic_image_deref_samples:
               case nir_intrinsic_has_input_vertex_amd:
               case nir_intrinsic_has_input_primitive_amd:
               case nir_intrinsic_load_workgroup_num_input_vertices_amd:
               case nir_intrinsic_load_workgroup_num_input_primitives_amd:
               case nir_intrinsic_load_shader_query_enabled_amd:
               case nir_intrinsic_load_cull_front_face_enabled_amd:
               case nir_intrinsic_load_cull_back_face_enabled_amd:
               case nir_intrinsic_load_cull_ccw_amd:
               case nir_intrinsic_load_cull_small_primitives_enabled_amd:
               case nir_intrinsic_load_cull_any_enabled_amd:
               case nir_intrinsic_load_viewport_x_scale:
               case nir_intrinsic_load_viewport_y_scale:
               case nir_intrinsic_load_viewport_x_offset:
               case nir_intrinsic_load_viewport_y_offset: type = RegType::sgpr; break;
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_sample_mask_in:
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_output:
               case nir_intrinsic_load_input_vertex:
               case nir_intrinsic_load_per_vertex_input:
               case nir_intrinsic_load_per_vertex_output:
               case nir_intrinsic_load_vertex_id:
               case nir_intrinsic_load_vertex_id_zero_base:
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_model:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset:
               case nir_intrinsic_load_interpolated_input:
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_frag_shading_rate:
               case nir_intrinsic_load_sample_pos:
               case nir_intrinsic_load_layer_id:
               case nir_intrinsic_load_local_invocation_id:
               case nir_intrinsic_load_local_invocation_index:
               case nir_intrinsic_load_subgroup_invocation:
               case nir_intrinsic_load_tess_coord:
               case nir_intrinsic_write_invocation_amd:
               case nir_intrinsic_mbcnt_amd:
               case nir_intrinsic_byte_permute_amd:
               case nir_intrinsic_lane_permute_16_amd:
               case nir_intrinsic_load_instance_id:
               case nir_intrinsic_ssbo_atomic_add:
               case nir_intrinsic_ssbo_atomic_imin:
               case nir_intrinsic_ssbo_atomic_umin:
               case nir_intrinsic_ssbo_atomic_imax:
               case nir_intrinsic_ssbo_atomic_umax:
               case nir_intrinsic_ssbo_atomic_and:
               case nir_intrinsic_ssbo_atomic_or:
               case nir_intrinsic_ssbo_atomic_xor:
               case nir_intrinsic_ssbo_atomic_exchange:
               case nir_intrinsic_ssbo_atomic_comp_swap:
               case nir_intrinsic_global_atomic_add:
               case nir_intrinsic_global_atomic_imin:
               case nir_intrinsic_global_atomic_umin:
               case nir_intrinsic_global_atomic_imax:
               case nir_intrinsic_global_atomic_umax:
               case nir_intrinsic_global_atomic_and:
               case nir_intrinsic_global_atomic_or:
               case nir_intrinsic_global_atomic_xor:
               case nir_intrinsic_global_atomic_exchange:
               case nir_intrinsic_global_atomic_comp_swap:
               case nir_intrinsic_image_deref_atomic_add:
               case nir_intrinsic_image_deref_atomic_umin:
               case nir_intrinsic_image_deref_atomic_imin:
               case nir_intrinsic_image_deref_atomic_umax:
               case nir_intrinsic_image_deref_atomic_imax:
               case nir_intrinsic_image_deref_atomic_and:
               case nir_intrinsic_image_deref_atomic_or:
               case nir_intrinsic_image_deref_atomic_xor:
               case nir_intrinsic_image_deref_atomic_exchange:
               case nir_intrinsic_image_deref_atomic_comp_swap:
               case nir_intrinsic_image_deref_size:
               case nir_intrinsic_shared_atomic_add:
               case nir_intrinsic_shared_atomic_imin:
               case nir_intrinsic_shared_atomic_umin:
               case nir_intrinsic_shared_atomic_imax:
               case nir_intrinsic_shared_atomic_umax:
               case nir_intrinsic_shared_atomic_and:
               case nir_intrinsic_shared_atomic_or:
               case nir_intrinsic_shared_atomic_xor:
               case nir_intrinsic_shared_atomic_exchange:
               case nir_intrinsic_shared_atomic_comp_swap:
               case nir_intrinsic_shared_atomic_fadd:
               case nir_intrinsic_load_scratch:
               case nir_intrinsic_load_invocation_id:
               case nir_intrinsic_load_primitive_id:
               case nir_intrinsic_load_buffer_amd:
               case nir_intrinsic_load_tess_rel_patch_id_amd:
               case nir_intrinsic_load_gs_vertex_offset_amd:
               case nir_intrinsic_load_initial_edgeflag_amd:
               case nir_intrinsic_load_packed_passthrough_primitive_amd:
               case nir_intrinsic_gds_atomic_add_amd:
               case nir_intrinsic_load_sbt_amd:
               case nir_intrinsic_bvh64_intersect_ray_amd:
               case nir_intrinsic_load_cull_small_prim_precision_amd: type = RegType::vgpr; break;
               case nir_intrinsic_load_shared:
                  /* When the result of these loads is only used by cross-lane instructions,
                   * it is beneficial to use a VGPR destination. This is because this allows
                   * to put the s_waitcnt further down, which decreases latency.
                   */
                  if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) {
                     type = RegType::vgpr;
                     break;
                  }
                  FALLTHROUGH;
               case nir_intrinsic_shuffle:
               case nir_intrinsic_quad_broadcast:
               case nir_intrinsic_quad_swap_horizontal:
               case nir_intrinsic_quad_swap_vertical:
               case nir_intrinsic_quad_swap_diagonal:
               case nir_intrinsic_quad_swizzle_amd:
               case nir_intrinsic_masked_swizzle_amd:
               case nir_intrinsic_inclusive_scan:
               case nir_intrinsic_exclusive_scan:
               case nir_intrinsic_reduce:
               case nir_intrinsic_load_ubo:
               case nir_intrinsic_load_ssbo:
               case nir_intrinsic_load_global:
               case nir_intrinsic_vulkan_resource_index:
               case nir_intrinsic_get_ssbo_size:
                  type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr;
                  break;
               case nir_intrinsic_load_view_index:
                  type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
                  break;
               default:
                  for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
                       i++) {
                     if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
                  break;
               }
               RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components,
                                           intrinsic->dest.ssa.bit_size);
               regclasses[intrinsic->dest.ssa.index] = rc;

               switch (intrinsic->intrinsic) {
               case nir_intrinsic_load_barycentric_sample:
               case nir_intrinsic_load_barycentric_pixel:
               case nir_intrinsic_load_barycentric_centroid:
               case nir_intrinsic_load_barycentric_at_sample:
               case nir_intrinsic_load_barycentric_at_offset: {
                  glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
                  spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode);
                  break;
               }
               case nir_intrinsic_load_barycentric_model:
                  spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1);
                  break;
               case nir_intrinsic_load_front_face:
                  spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1);
                  break;
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos: {
                  uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
                  for (unsigned i = 0; i < 4; i++) {
                     if (mask & (1 << i))
                        spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
                  }

                  if (ctx->options->adjust_frag_coord_z &&
                      intrinsic->intrinsic == nir_intrinsic_load_frag_coord &&
                      G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) {
                     /* Enable ancillary for adjusting gl_FragCoord.z for
                      * VRS due to a hw bug on some GFX10.3 chips.
                      */
                     spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  }
                  break;
               }
               case nir_intrinsic_load_sample_id:
               case nir_intrinsic_load_frag_shading_rate:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  break;
               case nir_intrinsic_load_sample_mask_in:
                  spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
                  spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
                  break;
               default: break;
               }
               break;
            }
            case nir_instr_type_tex: {
               nir_tex_instr* tex = nir_instr_as_tex(instr);
               RegType type = nir_dest_is_divergent(tex->dest) ? RegType::vgpr : RegType::sgpr;

               if (tex->op == nir_texop_texture_samples) {
                  assert(!tex->dest.ssa.divergent);
               }

               RegClass rc =
                  get_reg_class(ctx, type, tex->dest.ssa.num_components, tex->dest.ssa.bit_size);
               regclasses[tex->dest.ssa.index] = rc;
               break;
            }
            case nir_instr_type_parallel_copy: {
               nir_foreach_parallel_copy_entry (entry, nir_instr_as_parallel_copy(instr)) {
                  regclasses[entry->dest.ssa.index] = regclasses[entry->src.ssa->index];
               }
               break;
            }
            case nir_instr_type_ssa_undef: {
               unsigned num_components = nir_instr_as_ssa_undef(instr)->def.num_components;
               unsigned bit_size = nir_instr_as_ssa_undef(instr)->def.bit_size;
               RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
               regclasses[nir_instr_as_ssa_undef(instr)->def.index] = rc;
               break;
            }
            case nir_instr_type_phi: {
               nir_phi_instr* phi = nir_instr_as_phi(instr);
               RegType type = RegType::sgpr;
               unsigned num_components = phi->dest.ssa.num_components;
               assert((phi->dest.ssa.bit_size != 1 || num_components == 1) &&
                      "Multiple components not supported on boolean phis.");

               if (nir_dest_is_divergent(phi->dest)) {
                  type = RegType::vgpr;
               } else {
                  nir_foreach_phi_src (src, phi) {
                     if (regclasses[src->src.ssa->index].type() == RegType::vgpr)
                        type = RegType::vgpr;
                  }
               }

               RegClass rc = get_reg_class(ctx, type, num_components, phi->dest.ssa.bit_size);
               if (rc != regclasses[phi->dest.ssa.index])
                  done = false;
               regclasses[phi->dest.ssa.index] = rc;
               break;
            }
            default: break;
            }
         }
      }
   }

   if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) {
      /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   if (!(spi_ps_inputs & 0x7F)) {
      /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
      spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   ctx->program->config->spi_ps_input_ena = spi_ps_inputs;
   ctx->program->config->spi_ps_input_addr = spi_ps_inputs;

   ctx->cf_info.nir_to_aco.reset(nir_to_aco.release());

   /* align and copy constant data */
   while (ctx->program->constant_data.size() % 4u)
      ctx->program->constant_data.push_back(0);
   ctx->constant_data_offset = ctx->program->constant_data.size();
   ctx->program->constant_data.insert(ctx->program->constant_data.end(),
                                      (uint8_t*)shader->constant_data,
                                      (uint8_t*)shader->constant_data + shader->constant_data_size);
}

void
cleanup_context(isel_context* ctx)
{
   _mesa_hash_table_destroy(ctx->range_ht, NULL);
}

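/* Build the isel_context for a pipeline: combine the software stages of all
 * shaders, pick the hardware stage they run on, initialize the program and
 * compute its workgroup size, then run setup_nir() on each shader (a GS copy
 * shader only gets its VS output info set up).
 */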
isel_context
setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                   ac_shader_config* config, struct radv_shader_args* args, bool is_gs_copy_shader)
{
   SWStage sw_stage = SWStage::None;
   for (unsigned i = 0; i < shader_count; i++) {
      switch (shaders[i]->info.stage) {
      case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break;
      case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break;
      case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break;
      case MESA_SHADER_GEOMETRY:
         sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
         break;
      case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break;
      case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break;
      default: unreachable("Shader stage not implemented");
      }
   }
   bool gfx9_plus = args->options->chip_class >= GFX9;
   bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
   HWStage hw_stage{};
   if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
      hw_stage = HWStage::ES;
   else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: VS without GS uses the HW GS stage */
   else if (sw_stage == SWStage::GS)
      hw_stage = HWStage::GS;
   else if (sw_stage == SWStage::FS)
      hw_stage = HWStage::FS;
   else if (sw_stage == SWStage::CS)
      hw_stage = HWStage::CS;
   else if (sw_stage == SWStage::GSCopy)
      hw_stage = HWStage::VS;
   else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::VS_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: VS+GS merged into an NGG GS */
   else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
      hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
   else if (sw_stage == SWStage::TCS)
      hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
   else if (sw_stage == SWStage::VS_TCS)
      hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
      hw_stage = HWStage::NGG; /* GFX10/NGG: TES without GS */
   else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
      hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
   else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
      hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
   else if (sw_stage == SWStage::TES_GS && ngg)
      hw_stage = HWStage::NGG; /* GFX10+: TES+GS merged into an NGG GS */
   else
      unreachable("Shader stage not implemented");

   init_program(program, Stage{hw_stage, sw_stage}, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
      /* PS and legacy VS have separate waves, no workgroups */
      program->workgroup_size = program->wave_size;
   } else if (program->stage == compute_cs) {
      /* CS sets the workgroup size explicitly */
      program->workgroup_size = shaders[0]->info.workgroup_size[0] *
                                shaders[0]->info.workgroup_size[1] *
                                shaders[0]->info.workgroup_size[2];
   } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
      /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8
       * (not implemented in Mesa) */
      program->workgroup_size = program->wave_size;
   } else if (program->stage.hw == HWStage::GS) {
      /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
      assert(program->chip_class >= GFX9);
      uint32_t es_verts_per_subgrp =
         G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t gs_instr_prims_in_subgrp =
         G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
      uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp);
      program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1);
   } else if (program->stage == vertex_ls) {
      /* Unmerged LS operates in workgroups */
      program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but
                                             those are not plumbed to ACO for LS */
   } else if (program->stage == tess_control_hs) {
      /* Unmerged HS operates in workgroups, size is determined by the output vertices */
      setup_tcs_info(&ctx, shaders[0], NULL);
      program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out;
   } else if (program->stage == vertex_tess_control_hs) {
      /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS
       * invocations */
      setup_tcs_info(&ctx, shaders[1], shaders[0]);
      program->workgroup_size =
         ctx.tcs_num_patches *
         MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
   } else if (program->stage.hw == HWStage::NGG) {
      gfx10_ngg_info& ngg_info = args->shader_info->ngg_info;
      unsigned num_gs_invocations =
         (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1;

      /* Max ES (SW VS/TES) threads */
      uint32_t max_esverts = ngg_info.hw_max_esverts;
      /* Max GS input primitives = max GS threads */
      uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations;
      /* Maximum output vertices -- each thread can export only 1 vertex */
      uint32_t max_out_vtx = ngg_info.max_out_verts;
      /* Maximum output primitives -- each thread can export only 1 or 0 primitive */
      uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor;

      program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm);
   } else {
      unreachable("Unsupported shader stage.");
   }

   calc_min_waves(program);

   unsigned scratch_size = 0;
   if (program->stage == gs_copy_vs) {
      assert(shader_count == 1);
      setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
   } else {
      for (unsigned i = 0; i < shader_count; i++) {
         nir_shader* nir = shaders[i];
         setup_nir(&ctx, nir);
      }

      for (unsigned i = 0; i < shader_count; i++)
         scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
   }

   ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   return ctx;
}

} // namespace aco