GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "ac_llvm_cull.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_prim.h"

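/* Note (not upstream): packed SGPR fields unpacked by the helpers below, with
 * offsets/sizes taken directly from the si_unpack_param calls. merged_wave_info
 * carries the wave ID within the threadgroup in bits [24:27] and the wave count
 * in bits [28:31]; gs_tg_info carries the ordered (streamout) ID in bits [0:11],
 * the vertex count in bits [12:20], and the primitive count in bits [22:30].
 */
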
static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
}

static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.merged_wave_info, 28, 4);
}

static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}

static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 12, 9);
}

static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 22, 9);
}

static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 0, 12);
}

static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
{
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
                                LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
}

static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
{
   if (ctx->stage == MESA_SHADER_VERTEX) {
      LLVMValueRef tmp;
      tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
                          LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
      return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
   }
   return ctx->ac.i1false;
}

/**
 * Return the number of vertices as a constant in \p num_vertices,
 * and return a more precise value as LLVMValueRef from the function.
 */
static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   if (ctx->stage == MESA_SHADER_VERTEX) {
      if (info->base.vs.blit_sgprs_amd) {
         /* Blits always use axis-aligned rectangles with 3 vertices. */
         *num_vertices = 3;
         return LLVMConstInt(ctx->ac.i32, 3, 0);
      } else {
         /* We always build up all three indices for the prim export
          * independent of the primitive type. The additional garbage
          * data shouldn't hurt. This number doesn't matter with
          * NGG passthrough.
          */
         *num_vertices = 3;

         /* Extract OUTPRIM field. */
         LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
         return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);

      if (info->base.tess.point_mode)
         *num_vertices = 1;
      else if (info->base.tess.primitive_mode == GL_LINES)
         *num_vertices = 2;
      else
         *num_vertices = 3;

      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
   }
}

bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   assert(shader->key.as_ngg && !shader->key.as_es);

   return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
}

void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
{
   /* Newer chips can use PRIMGEN_PASSTHRU_NO_MSG to skip gs_alloc_req for NGG passthrough. */
   if (gfx10_is_ngg_passthrough(ctx->shader) &&
       ctx->screen->info.family >= CHIP_DIMGREY_CAVEFISH)
      return;

   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
                                 ngg_get_prim_cnt(ctx));
}

void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
      {
         struct ac_ngg_prim prim = {};

         if (prim_passthrough)
            prim.passthrough = prim_passthrough;
         else
            prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);

         /* This is only used with NGG culling, which returns the NGG
          * passthrough prim export encoding.
          */
         if (ctx->shader->selector->info.writes_edgeflag) {
            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
            LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);

            unsigned num_vertices;
            ngg_get_vertices_per_prim(ctx, &num_vertices);

            for (unsigned i = 0; i < num_vertices; i++) {
               unsigned shift = 9 + i * 10;
               LLVMValueRef edge;

               edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
               edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
            }
            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
         }

         ac_build_export_prim(&ctx->ac, &prim);
      }
      ac_build_endif(&ctx->ac, 6001);
      return;
   }

   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
   {
      struct ac_ngg_prim prim = {};

      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);

      prim.isnull = ctx->ac.i1false;
      prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);

      for (unsigned i = 0; i < prim.num_vertices; ++i) {
         prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);

         if (ctx->shader->selector->info.writes_edgeflag) {
            LLVMValueRef edge;

            edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
            edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
            prim.edgeflag[i] = edge;
         }
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 6001);
}

static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
                                   LLVMValueRef *wg_offset_dw, unsigned stream,
                                   LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset[4] = {};
   LLVMValueRef tmp;

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (!wg_offset_dw[buffer])
         continue;

      tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
                         "");
      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
   }

   for (unsigned i = 0; i < so->num_outputs; ++i) {
      if (so->output[i].stream != stream)
         continue;

      unsigned reg = so->output[i].register_index;
      struct si_shader_output_values out;
      out.semantic = info->output_semantic[reg];

      for (unsigned comp = 0; comp < 4; comp++) {
         tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
         out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
      }

      si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
   }
}

struct ngg_streamout {
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};

/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * Writes number of emitted primitives to gs_ngg_scratch[4:8].
 *
 * Clobbers gs_ngg_scratch[8:].
 */
static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
   LLVMValueRef so_buffer[4] = {};
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {};
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
   int stream_for_buffer[4] = {-1, -1, -1, -1};
   unsigned bufmask_for_stream[4] = {};
   bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->output[i].output_buffer;
      unsigned stream = so->output[i].stream;
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->stride[buffer]);

      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
      prim_stride_dw_vgpr =
         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
                            LLVMConstInt(ctx->ac.i32, buffer, false));

      so_buffer[buffer] = ac_build_load_to_sgpr(
         &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
   }

   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
   ac_build_ifcc(&ctx->ac, tmp, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5210);
      {
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!info->num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->ac.i32_0,                             // ordering
            ctx->ac.i32_0,                             // scope
            ctx->ac.i1false,                           // isVolatile
            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
            ctx->ac.i1true,                            // wave release
            ctx->ac.i1true,                            // wave done
         };
         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
                                  ARRAY_SIZE(args), 0);

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
      LLVMValueRef max_emit[4] = {};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         LLVMValueRef bufsize_dw = LLVMBuildLShr(
            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");

         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));

         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");

         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
      }

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->ac.i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));

         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         emit_vgpr =
            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {};

   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch = ac_build_gep0(
            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         if (ctx->stage == MESA_SHADER_GEOMETRY) {
            /* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */
            primemit_scan[stream].maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);
         } else {
            primemit_scan[stream].maxwaves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size,
                                                          ctx->ac.wave_size);
         }
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {};

   {
      LLVMValueRef scratch_vgpr;

      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (info->num_stream_output_components[stream]) {
            nggso->emit[stream] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
                          nggso->emit[stream], "");
      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, tmp, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");

         for (unsigned i = 0; i < max_num_vertices; ++i) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
                                nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, tmp, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
                                   nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}

/* LDS layout of ES vertex data for NGG culling. */
enum
{
   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
    *         ES thread ID. After vertex compaction, compacted ES threads
    *         store the old thread ID here to copy input VGPRs from uncompacted
    *         ES threads.
    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
    * Byte 2: TES rel patch ID
    * Byte 3: Unused
    */
   lds_byte0_accept_flag = 0,
   lds_byte1_new_thread_id,
   lds_byte2_tes_rel_patch_id,
   lds_byte3_unused,

   lds_packed_data = 0, /* lds_byteN_... */
   lds_pos_cull_x_div_w,
   lds_pos_cull_y_div_w,
   lds_pos_cull_w,

   lds_pos_x = lds_packed_data + 1,
   lds_pos_y,
   lds_pos_z,
   lds_pos_w,
   /* If VS: */
   lds_vertex_id,
   lds_instance_id, /* optional */
   /* If TES: */
   lds_tes_u = lds_vertex_id,
   lds_tes_v = lds_instance_id,
   lds_tes_patch_id, /* optional */
};

static LLVMValueRef si_build_gep_i8_var(struct si_shader_context *ctx, LLVMValueRef ptr,
                                        LLVMValueRef index)
{
   LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);

   return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
                       1, "");
}

static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
                                    unsigned byte_index)
{
   assert(byte_index < 4);
   return si_build_gep_i8_var(ctx, ptr, LLVMConstInt(ctx->ac.i32, byte_index, 0));
}

static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
{
   unsigned lds_vertex_size = 0;

   /* The edgeflag is always stored in the last element that's also
    * used for padding to reduce LDS bank conflicts. */
   if (shader->selector->so.num_outputs)
      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
   if (shader->selector->info.writes_edgeflag)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   /* LDS size for passing data from GS to ES.
    * GS stores Primitive IDs into LDS at the address corresponding
    * to the ES thread of the provoking vertex. All ES threads
    * load and export PrimitiveID for their thread.
    */
   if (shader->selector->info.stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   if (shader->key.opt.ngg_culling) {
      if (shader->selector->info.stage == MESA_SHADER_VERTEX) {
         STATIC_ASSERT(lds_instance_id + 1 == 7);
         lds_vertex_size = MAX2(lds_vertex_size, 7);
      } else {
         assert(shader->selector->info.stage == MESA_SHADER_TESS_EVAL);

         if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
            STATIC_ASSERT(lds_tes_patch_id + 2 == 9); /* +1 for LDS padding */
            lds_vertex_size = MAX2(lds_vertex_size, 9);
         } else {
            STATIC_ASSERT(lds_tes_v + 1 == 7);
            lds_vertex_size = MAX2(lds_vertex_size, 7);
         }
      }
   }

   return lds_vertex_size;
}

/**
 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 * for the vertex outputs.
 */
static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
{
   /* The extra dword is used to avoid LDS bank conflicts. */
   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}

static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
                                          struct ac_arg param, unsigned return_index)
{
   LLVMValueRef v = ac_get_arg(&ctx->ac, param);

   for (unsigned i = 0; i < 4; i++) {
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
                                 return_index + i, "");
   }
   return ret;
}

static void load_vertex_counts(struct si_shader_context *ctx, LLVMValueRef lds,
                               unsigned max_waves, LLVMValueRef tid,
                               LLVMValueRef *total_count,
                               LLVMValueRef *prefix_sum)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8vec4_lane = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
   unsigned num_i8vec4 = DIV_ROUND_UP(max_waves, 4);

   /* If all threads loaded the vertex counts, it would cause many LDS bank conflicts
    * and the performance could decrease up to WaveSize times (32x or 64x).
    *
    * Therefore, only load the i-th tuple of vertex counts in the i-th thread. Other threads will
    * get them through readlane. 4 8-bit vertex counts are loaded per thread.
    */
   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntULT, tid,
                                         LLVMConstInt(ctx->ac.i32, num_i8vec4, 0), ""), 17771);
   LLVMBuildStore(builder, LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, lds, tid), ""), i8vec4_lane);
   ac_build_endif(&ctx->ac, 17771);

   /* Compute the number of ES waves. */
   LLVMValueRef num_waves = get_tgsize(ctx);

   /* Compute a byte mask where each byte is either 0 or 0xff depending on whether the wave
    * exists. We need the mask to clear uninitialized bytes in LDS and to compute the prefix sum.
    *
    * 8 waves: valid_mask = ~0ull >> (64 - num_waves * 8)
    * 4 waves: valid_mask = ~0 >> (32 - num_waves * 8)
    */
   LLVMValueRef num_waves8 = LLVMBuildShl(builder, num_waves, LLVMConstInt(ctx->ac.i32, 3, 0), "");
   LLVMValueRef valid_mask;

   if (max_waves > 4) {
      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 64, 0),
                                                 num_waves8, "");
      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i64, ~0ull, 0),
                                 LLVMBuildZExt(builder, num_waves8_rev, ctx->ac.i64, ""), "");
   } else {
      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 32, 0),
                                                 num_waves8, "");
      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, ~0, 0), num_waves8_rev, "");
   }

   /* Compute a byte mask where bytes below wave_id are 0xff, else they are 0.
    *
    * prefix_mask = ~(~0 << (wave_id * 8))
    */
   LLVMTypeRef type = max_waves > 4 ? ctx->ac.i64 : ctx->ac.i32;
   LLVMValueRef wave_id8 = LLVMBuildShl(builder, get_wave_id_in_tg(ctx),
                                        LLVMConstInt(ctx->ac.i32, 3, 0), "");
   LLVMValueRef prefix_mask =
      LLVMBuildNot(builder, LLVMBuildShl(builder, LLVMConstInt(type, ~0ull, 0),
                                         LLVMBuildZExt(builder, wave_id8, type, ""), ""), "");

   /* Compute the total vertex count and the vertex count of previous waves (prefix). */
   *total_count = ctx->ac.i32_0;
   *prefix_sum = ctx->ac.i32_0;

   for (unsigned i = 0; i < num_i8vec4; i++) {
      LLVMValueRef i8vec4;

      i8vec4 = ac_build_readlane_no_opt_barrier(&ctx->ac, LLVMBuildLoad(builder, i8vec4_lane, ""),
                                                LLVMConstInt(ctx->ac.i32, i, 0));
      /* Inactive waves have uninitialized vertex counts. Set them to 0 using this. */
      i8vec4 = LLVMBuildAnd(builder, i8vec4,
                            ac_unpack_param(&ctx->ac, valid_mask, 32 * i, 32), "");
      /* Compute the sum of all i8vec4 components and add it to the result. */
      *total_count = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
                                        (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *total_count},
                                        3, AC_FUNC_ATTR_READNONE);
      ac_set_range_metadata(&ctx->ac, *total_count, 0, 64*4 + 1); /* the result is at most 64*4 */

      /* Compute the sum of the vertex counts of all previous waves. */
      i8vec4 = LLVMBuildAnd(builder, i8vec4,
                            ac_unpack_param(&ctx->ac, prefix_mask, 32 * i, 32), "");
      *prefix_sum = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
                                       (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *prefix_sum},
                                       3, AC_FUNC_ATTR_READNONE);
      ac_set_range_metadata(&ctx->ac, *prefix_sum, 0, 64*4 + 1); /* the result is at most 64*4 */
   }
   *total_count = ac_build_readlane_no_opt_barrier(&ctx->ac, *total_count, NULL);
}

/**
 * Given a total thread count, update total and per-wave thread counts in input SGPRs
 * and return the per-wave thread count.
 *
 * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
 * \param tg_info            tg_info SGPR value
 * \param tg_info_num_bits   the bit size of thread count field in tg_info
 * \param tg_info_shift      the bit offset of the thread count field in tg_info
 * \param wave_info          merged_wave_info SGPR value
 * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
 * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
 */
static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
                                 LLVMValueRef *tg_info, unsigned tg_info_num_bits,
                                 unsigned tg_info_shift, LLVMValueRef *wave_info,
                                 unsigned wave_info_num_bits, unsigned wave_info_shift)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   /* Update the total thread count. */
   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
   *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
   *tg_info = LLVMBuildOr(
      builder, *tg_info,
      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");

   /* Update the per-wave thread count. */
   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
   *new_num_threads =
      ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
   *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
   *wave_info = LLVMBuildOr(
      builder, *wave_info,
      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
      "");
}

static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                           void *userdata)
{
   struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
   LLVMValueRef *params = (LLVMValueRef *)userdata;
   LLVMValueRef gs_accepted = params[0];
   LLVMValueRef *gs_vtxptr = (LLVMValueRef *)params[1];

   ac_build_ifcc(&ctx->ac, accepted, 0);
   LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_1, gs_accepted);
   for (unsigned vtx = 0; vtx < 3; vtx++) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i8_1,
                     si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
   }
   ac_build_endif(&ctx->ac, 0);
}

/**
 * Cull primitives for NGG VS or TES, then compact vertices, which happens
 * before the VS or TES main function. Return values for the main function.
 * Also return the position, which is passed to the shader as an input,
 * so that we don't compute it twice.
 */
void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_info *info = &sel->info;
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned max_waves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size, ctx->ac.wave_size);

   assert(shader->key.opt.ngg_culling);
   assert(shader->key.as_ngg);
   assert(sel->info.stage == MESA_SHADER_VERTEX ||
          (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es));

   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
   unsigned pos_index = 0;

   for (unsigned i = 0; i < info->num_outputs; i++) {
      LLVMValueRef position[4];

      switch (info->output_semantic[i]) {
      case VARYING_SLOT_POS:
         /* If we are going to cull everything (rasterizer_discard), discard
          * the position. This is useful for analyzing maximum theoretical
          * performance without VS input loads.
          */
         if (shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE &&
             shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE) {
            for (unsigned j = 0; j < 4; j++)
               LLVMBuildStore(builder, LLVMGetUndef(ctx->ac.f32), addrs[4 * i + j]);
            break;
         }

         pos_index = i;
         for (unsigned j = 0; j < 4; j++) {
            position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         }

         /* Store Position.W into LDS. */
         LLVMBuildStore(
            builder, ac_to_integer(&ctx->ac, position[3]),
            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_w, 0)));

         /* Store Position.XY / W into LDS. */
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
            LLVMBuildStore(
               builder, ac_to_integer(&ctx->ac, val),
               ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
         }
         break;
      }
   }

   /* Initialize the packed data. */
   LLVMBuildStore(
      builder, ctx->ac.i32_0,
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
   ac_build_s_barrier(&ctx->ac);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* The hardware requires that there are no holes between unculled vertices,
    * which means we have to pack ES threads, i.e. reduce the ES thread count
    * and move ES input VGPRs to lower threads. The upside is that varyings
    * are only fetched and computed for unculled vertices.
    *
    * Vertex compaction:
    *
    * Part 1: Store the surviving vertex count for each wave in LDS.
    *   - The GS culling code notifies ES threads which vertices were accepted.
    *   - Barrier
    *   - ES threads will compute the vertex count and store it in LDS.
    *   - Barrier
    *   - Each wave loads the vertex counts from LDS.
    *
    * Part 2: Compact ES threads:
    *   - Compute the prefix sum for each surviving vertex. This is the new thread ID
    *     of the vertex.
    *   - Write input VGPRs and vertex positions for each surviving vertex into the LDS
    *     address of the new thread ID.
    *   - Now kill all waves that have inactive threads.
    *   - Barrier
    *   - Update vertex indices and null flag in the GS input VGPRs.
    *
    * Part 3: Update inputs GPRs
    *   - For all waves, update per-wave thread counts in input SGPRs.
    *   - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
    */

   LLVMValueRef vtxindex[3];
   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
      /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
       * into these VGPRs.
       */
      vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
      vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
      vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   };
   LLVMValueRef gs_vtxptr[] = {
      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
   };
   es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   /* Adding these optimization barriers improves the generated code as follows. Crazy right?
    *
    * - s_mov_b32 s4, 0xffff
    * - v_lshrrev_b32_e32 v10, 16, v0
    * - v_and_b32_e32 v12, s4, v0
    * - v_and_b32_e32 v11, s4, v1
    *   s_bfe_u32 s4, s3, 0x80008
    * - s_mov_b64 s[8:9], 0
    * - v_mul_u32_u24_e32 v0, 28, v10
    * - v_mul_u32_u24_e32 v9, 28, v12
    * - v_mul_u32_u24_e32 v1, 28, v11
    * + v_mov_b32_e32 v11, 28
    *   v_cmp_gt_u32_e32 vcc, s4, v2
    * + s_mov_b64 s[8:9], 0
    *   s_waitcnt lgkmcnt(0)
    *   s_barrier
    * + v_mul_u32_u24_sdwa v10, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    * + v_mul_u32_u24_sdwa v23, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    * + v_mul_u32_u24_sdwa v0, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    *   s_and_saveexec_b64 s[44:45], vcc
    *   s_cbranch_execz BB2_8
    * - v_mul_u32_u24_e32 v16, 28, v12
    * - v_mul_u32_u24_e32 v17, 28, v11
    * - v_mul_u32_u24_e32 v18, 28, v10
    */
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[0], false);
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[1], false);
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[2], false);

   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");

   /* Do culling in GS threads. */
   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
   {
      /* Load positions. */
      LLVMValueRef pos[3][4] = {};
      for (unsigned vtx = 0; vtx < 3; vtx++) {
         for (unsigned chan = 0; chan < 4; chan++) {
            unsigned index;
            if (chan == 0 || chan == 1)
               index = lds_pos_cull_x_div_w + chan;
            else if (chan == 3)
               index = lds_pos_cull_w;
            else
               continue;

            LLVMValueRef addr =
               ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
         }
      }

      /* Load the viewport state for small prim culling. */
      LLVMValueRef vp = ac_build_load_invariant(
         &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
      vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
      LLVMValueRef vp_scale[2], vp_translate[2];
      vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
      vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
      vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
      vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

      /* Get the small prim filter precision. */
      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
      small_prim_precision =
         LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
      small_prim_precision =
         LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");

      /* Execute culling code. */
      struct ac_cull_options options = {};
      options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
      options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
      options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
      options.cull_small_prims = options.cull_view_xy;
      options.cull_zero_area = options.cull_front || options.cull_back;
      options.cull_w = true;

      /* Tell ES threads whether their vertex survived. */
      LLVMValueRef params[] = {
         gs_accepted,
         (void*)gs_vtxptr,
      };
      ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
                       small_prim_precision, &options,
                       gfx10_build_primitive_accepted, params);
   }
   ac_build_endif(&ctx->ac, 16002);
   ac_build_s_barrier(&ctx->ac);

   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");

   LLVMValueRef vertex_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
   LLVMValueRef vertex_mask = ac_build_alloca(&ctx->ac, ctx->ac.iN_wavemask, "");

   /* Convert the per-vertex accept flag to a vertex thread mask, store it in registers. */
   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
   {
      LLVMValueRef accepted =
         LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
      accepted = LLVMBuildICmp(builder, LLVMIntNE, accepted, ctx->ac.i8_0, "");
      LLVMValueRef mask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

      LLVMBuildStore(builder, accepted, vertex_accepted);
      LLVMBuildStore(builder, mask, vertex_mask);
   }
   ac_build_endif(&ctx->ac, 16007);

   /* Store the per-wave vertex count to LDS. Non-ES waves store 0. */
   vertex_mask = LLVMBuildLoad(builder, vertex_mask, "");
   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
   {
      LLVMValueRef vertex_count = ac_build_bit_count(&ctx->ac, vertex_mask);
      LLVMBuildStore(builder, LLVMBuildTrunc(builder, vertex_count, ctx->ac.i8, ""),
                     si_build_gep_i8_var(ctx, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
   }
   ac_build_endif(&ctx->ac, 16008);

   ac_build_s_barrier(&ctx->ac);

   /* Load the vertex masks and compute the new ES thread count. */
   LLVMValueRef new_num_es_threads, prefix_sum, kill_wave;
   load_vertex_counts(ctx, ctx->gs_ngg_scratch, max_waves, tid, &new_num_es_threads,
                      &prefix_sum);

   bool uses_instance_id = ctx->stage == MESA_SHADER_VERTEX &&
                           (sel->info.uses_instanceid ||
                            shader->key.part.vs.prolog.instance_divisor_is_one ||
                            shader->key.part.vs.prolog.instance_divisor_is_fetched);
   bool uses_tes_prim_id = ctx->stage == MESA_SHADER_TESS_EVAL &&
                           (sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id);

   /* ES threads compute their prefix sum, which is the new ES thread ID.
    * Then they write the vertex position and input VGPRs into the LDS address
    * of the new thread ID. It will be used to load input VGPRs by compacted
    * threads.
    */
   vertex_accepted = LLVMBuildLoad(builder, vertex_accepted, "");
   ac_build_ifcc(&ctx->ac, vertex_accepted, 16009);
   {
      /* Add the number of bits set in vertex_mask up to the current thread ID - 1
       * to get the prefix sum.
       */
      prefix_sum = LLVMBuildAdd(builder, prefix_sum, ac_build_mbcnt(&ctx->ac, vertex_mask), "");

      LLVMValueRef new_id = prefix_sum;
      LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);

      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));

      /* Store Position.XYZW into LDS. */
      for (unsigned chan = 0; chan < 4; chan++) {
         LLVMBuildStore(
            builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")),
            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
      }

      /* Store VertexID and InstanceID into LDS. ES threads will have to load them
       * from LDS after vertex compaction and use them instead of their own
       * system values.
       */
      if (ctx->stage == MESA_SHADER_VERTEX) {
         LLVMBuildStore(
            builder, ctx->abi.vertex_id,
            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
         if (uses_instance_id) {
            LLVMBuildStore(
               builder, ctx->abi.instance_id,
               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
         }
      } else {
         assert(ctx->stage == MESA_SHADER_TESS_EVAL);
         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_u)),
                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_v)),
                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
         LLVMBuildStore(builder, LLVMBuildTrunc(builder, ac_get_arg(&ctx->ac, ctx->args.tes_rel_patch_id), ctx->ac.i8, ""),
                        si_build_gep_i8(ctx, new_vtx, lds_byte2_tes_rel_patch_id));
         if (uses_tes_prim_id) {
            LLVMBuildStore(
               builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
         }
      }
   }
   ac_build_endif(&ctx->ac, 16009);

   /* If all vertices are culled, set the primitive count to 0, so that all waves are culled here. */
   LLVMValueRef num_primitives = ngg_get_prim_cnt(ctx);
   num_primitives = LLVMBuildSelect(builder,
                                    LLVMBuildICmp(builder, LLVMIntEQ, new_num_es_threads,
                                                  ctx->ac.i32_0, ""),
                                    ctx->ac.i32_0, num_primitives, "");
   /* Kill waves that have inactive threads. */
   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
                             ac_build_imax(&ctx->ac, new_num_es_threads, num_primitives),
                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
                             "");
   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
   {
      /* If we are killing wave 0, send that there are no primitives
       * in this threadgroup.
       */
      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
      ac_build_s_endpgm(&ctx->ac);
   }
   ac_build_endif(&ctx->ac, 19202);
   ac_build_s_barrier(&ctx->ac);

   /* Send the final vertex and primitive counts. */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
                                 ngg_get_prim_cnt(ctx));

   /* Update thread counts in SGPRs. */
   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->args.gs_tg_info);
   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->args.merged_wave_info);

   /* This also converts the thread count from the total count to the per-wave count. */
   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
                        0);

   /* Update vertex indices in VGPR0 (same format as NGG passthrough).
    *
    * Set the null flag at the beginning (culled), and then
    * overwrite it for accepted primitives.
    */
   LLVMValueRef new_vgpr0 =
      ac_build_alloca_init(&ctx->ac, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), "");

   /* Get vertex indices after vertex compaction. */
   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
   {
      struct ac_ngg_prim prim = {};
      prim.num_vertices = 3;
      prim.isnull = ctx->ac.i1false;

      for (unsigned vtx = 0; vtx < 3; vtx++) {
         prim.index[vtx] = LLVMBuildLoad(
            builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
         prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
      }

      /* Set the new GS input VGPR. */
      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
   }
   ac_build_endif(&ctx->ac, 16011);

   if (gfx10_ngg_export_prim_early(shader))
      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));

   /* Prepare LDS addresses of the new ES input VGPRs. */
   LLVMValueRef input_vgpr_addresses[4] = {
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)),
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)),
   };
   if (ctx->stage == MESA_SHADER_TESS_EVAL) {
      input_vgpr_addresses[2] = si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id);
      if (uses_tes_prim_id) {
         input_vgpr_addresses[3] = ac_build_gep0(&ctx->ac, es_vtxptr,
                                                 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0));
      }
   }

   /* Return values for the main function. */
   LLVMValueRef ret = ctx->return_value;
   LLVMValueRef val;

   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
   if (ctx->stage == MESA_SHADER_TESS_EVAL)
      ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 4);

   ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);

      for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
         ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
                                     8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
   }

   unsigned vgpr;
   if (ctx->stage == MESA_SHADER_VERTEX) {
      if (shader->selector->num_vbos_in_user_sgprs) {
         vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
      } else {
         vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
      }
   } else {
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
   }

   val = LLVMBuildLoad(builder, new_vgpr0, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
   vgpr++; /* gs_vtx23_offset */

   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   vgpr++; /* gs_vtx45_offset */

   /* Set the input VPGRs to the corresponding LDS addresses where the VGPR values are
    * stored. The VS prolog will load them.
    */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[0], ctx->ac.i32, "");
      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                 ""); /* VGPR5 - VertexID */
      vgpr += 2;
      if (uses_instance_id) {
         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[1], ctx->ac.i32, "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                    ""); /* VGPR8 - InstanceID */
      } else {
         vgpr++;
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
      for (unsigned i = 0; i < num_vgprs; i++) {
         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[i], ctx->ac.i32, "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
      }
      if (num_vgprs == 3)
         vgpr++;
   }

   /* These two also use LDS. */
   if (sel->info.writes_edgeflag ||
       (ctx->stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
      ac_build_s_barrier(&ctx->ac);

   ctx->return_value = ret;
}

/**
1281
* Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
1282
*/
1283
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
1284
{
1285
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1286
struct si_shader_selector *sel = ctx->shader->selector;
1287
struct si_shader_info *info = &sel->info;
1288
struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1289
LLVMBuilderRef builder = ctx->ac.builder;
1290
LLVMValueRef tmp, tmp2;
1291
1292
assert(!ctx->shader->is_gs_copy_shader);
1293
assert(info->num_outputs <= max_outputs);
1294
1295
LLVMValueRef vertex_ptr = NULL;
1296
1297
if (sel->so.num_outputs || sel->info.writes_edgeflag)
1298
vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1299
1300
for (unsigned i = 0; i < info->num_outputs; i++) {
1301
outputs[i].semantic = info->output_semantic[i];
1302
1303
for (unsigned j = 0; j < 4; j++) {
1304
outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
1305
1306
/* TODO: we may store more outputs than streamout needs,
1307
* but streamout performance isn't that important.
1308
*/
1309
if (sel->so.num_outputs) {
1310
tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
1311
tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1312
tmp2 = ac_to_integer(&ctx->ac, tmp2);
1313
LLVMBuildStore(builder, tmp2, tmp);
1314
}
1315
}
1316
1317
/* Store the edgeflag at the end (if streamout is enabled) */
1318
if (info->output_semantic[i] == VARYING_SLOT_EDGE && sel->info.writes_edgeflag) {
1319
LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
1320
/* The output is a float, but the hw expects a 1-bit integer. */
1321
edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
1322
edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
1323
1324
tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1325
tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1326
LLVMBuildStore(builder, edgeflag, tmp);
1327
}
1328
}
1329
1330
bool unterminated_es_if_block =
1331
!sel->so.num_outputs && !sel->info.writes_edgeflag &&
1332
!ctx->screen->use_ngg_streamout && /* no query buffer */
1333
(ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);
1334
1335
if (!unterminated_es_if_block)
1336
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1337
1338
LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
1339
LLVMValueRef is_es_thread = si_is_es_thread(ctx);
1340
LLVMValueRef vtxindex[3];
1341
1342
if (ctx->shader->key.opt.ngg_culling) {
1343
vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
1344
vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
1345
vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
1346
} else {
1347
vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
1348
vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
1349
vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
1350
}
1351
1352
/* Determine the number of vertices per primitive. */
1353
unsigned num_vertices;
1354
LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
1355
1356
/* Streamout */
1357
LLVMValueRef emitted_prims = NULL;
1358
1359
if (sel->so.num_outputs) {
1360
assert(!unterminated_es_if_block);
1361
1362
struct ngg_streamout nggso = {};
1363
nggso.num_vertices = num_vertices_val;
1364
nggso.prim_enable[0] = is_gs_thread;
1365
1366
for (unsigned i = 0; i < num_vertices; ++i)
1367
nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1368
1369
build_streamout(ctx, &nggso);
1370
emitted_prims = nggso.emit[0];
1371
}
1372
1373
LLVMValueRef user_edgeflags[3] = {};
1374
1375
if (sel->info.writes_edgeflag) {
1376
assert(!unterminated_es_if_block);
1377
1378
/* Streamout already inserted the barrier, so don't insert it again. */
1379
if (!sel->so.num_outputs)
1380
ac_build_s_barrier(&ctx->ac);
1381
1382
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1383
/* Load edge flags from ES threads and store them into VGPRs in GS threads. */
1384
for (unsigned i = 0; i < num_vertices; i++) {
1385
tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1386
tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1387
tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
1388
tmp = LLVMBuildLoad(builder, tmp, "");
1389
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1390
1391
user_edgeflags[i] = ac_build_alloca_init(&ctx->ac, tmp, "");
1392
}
1393
ac_build_endif(&ctx->ac, 5400);
1394
}
1395
1396
/* Copy Primitive IDs from GS threads to the LDS address corresponding
1397
* to the ES thread of the provoking vertex.
1398
*/
1399
if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
1400
assert(!unterminated_es_if_block);
1401
1402
/* Streamout and edge flags use LDS. Wait for it to become idle so that we can reuse it. */
1403
if (sel->so.num_outputs || sel->info.writes_edgeflag)
1404
ac_build_s_barrier(&ctx->ac);
1405
1406
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1407
/* Extract the PROVOKING_VTX_INDEX field. */
1408
LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
1409
1410
/* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1411
LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1412
LLVMValueRef provoking_vtx_index =
1413
LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1414
LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
1415
1416
LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
1417
ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
1418
ac_build_endif(&ctx->ac, 5400);
1419
}
1420
1421
/* Update query buffer */
1422
if (ctx->screen->use_ngg_streamout && !info->base.vs.blit_sgprs_amd) {
1423
assert(!unterminated_es_if_block);
1424
1425
tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1426
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1427
ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
1428
tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
1429
ac_build_ifcc(&ctx->ac, tmp, 5030);
1430
tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
1431
sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
1432
ac_build_ifcc(&ctx->ac, tmp, 5031);
1433
{
1434
LLVMValueRef args[] = {
1435
ngg_get_prim_cnt(ctx),
1436
ngg_get_query_buf(ctx),
1437
LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
1438
ctx->ac.i32_0, /* soffset */
1439
ctx->ac.i32_0, /* cachepolicy */
1440
};
1441
1442
if (sel->so.num_outputs) {
1443
args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
1444
args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
1445
ctx->ac.i32_1);
1446
}
1447
1448
/* TODO: should this be 64-bit atomics? */
1449
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1450
0);
1451
}
1452
ac_build_endif(&ctx->ac, 5031);
1453
ac_build_endif(&ctx->ac, 5030);
1454
ac_build_endif(&ctx->ac, 5029);
1455
}
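/* Illustrative note (editor's addition): the buffer atomic above runs on at
 * most two lanes of wave 0.  After the writelanes:
 *
 *   lane 0: atomic add of ngg_get_prim_cnt() at voffset 16  (generated prims)
 *   lane 1: atomic add of emitted_prims at voffset 24       (streamout only)
 *
 * The "thread_id <= (so.num_outputs ? 1 : 0)" compare is what enables lane 1
 * only when streamout outputs exist; offset 24 is presumably the emitted-
 * primitive counter of stream 0 in the query buffer.
 */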
1456
1457
/* Build the primitive export. */
1458
if (!gfx10_ngg_export_prim_early(ctx->shader)) {
1459
assert(!unterminated_es_if_block);
1460
gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
1461
}
1462
1463
/* Export per-vertex data (positions and parameters). */
1464
if (!unterminated_es_if_block)
1465
ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1466
{
1467
unsigned i;
1468
1469
/* Unconditionally (re-)load the values for proper SSA form. */
1470
for (i = 0; i < info->num_outputs; i++) {
1471
/* If the NGG cull shader part computed the position, don't
1472
* use the position from the current shader part. Instead,
1473
* load it from LDS.
1474
*/
1475
if (info->output_semantic[i] == VARYING_SLOT_POS &&
1476
ctx->shader->key.opt.ngg_culling) {
1477
vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1478
1479
for (unsigned j = 0; j < 4; j++) {
1480
tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
1481
tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1482
tmp = LLVMBuildLoad(builder, tmp, "");
1483
outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1484
}
1485
} else {
1486
for (unsigned j = 0; j < 4; j++) {
1487
outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1488
}
1489
}
1490
}
1491
1492
if (ctx->shader->key.mono.u.vs_export_prim_id) {
1493
outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
1494
1495
if (ctx->stage == MESA_SHADER_VERTEX) {
1496
/* Wait for GS stores to finish. */
1497
ac_build_s_barrier(&ctx->ac);
1498
1499
tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1500
tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1501
outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
1502
} else {
1503
assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1504
outputs[i].values[0] = si_get_primitive_id(ctx, 0);
1505
}
1506
1507
outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
1508
for (unsigned j = 1; j < 4; j++)
1509
outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
1510
1511
memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
1512
i++;
1513
}
1514
1515
si_llvm_build_vs_exports(ctx, outputs, i);
1516
}
1517
ac_build_endif(&ctx->ac, 6002);
1518
}
1519
1520
static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
1521
{
1522
const struct si_shader_selector *sel = ctx->shader->selector;
1523
const struct si_shader_info *info = &sel->info;
1524
1525
LLVMTypeRef elements[2] = {
1526
LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
1527
LLVMArrayType(ctx->ac.i8, 4),
1528
};
1529
LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1530
type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1531
return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1532
}
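/* Illustrative sketch (editor's addition): the LDS element type built above,
 * expressed as an equivalent C layout, where num_outputs stands for
 * info->num_outputs of the API GS:
 *
 *   struct ngg_gs_vertex {
 *      uint32_t outputs[4 * num_outputs];  // all output components, AoS
 *      uint8_t  primflags[4];              // one flag byte per vertex stream
 *   };
 *
 * ctx->gs_ngg_emit is reinterpreted as an unbounded array of such records.
 */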
1533
1534
/**
1535
* Return a pointer to the LDS storage reserved for the N'th vertex, where N
1536
* is in emit order; that is:
1537
* - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1538
* - during vertex emit, i.e. while the API GS shader invocation is running,
1539
* N = threadidx * gs.vertices_out + emitidx
1540
*
1541
* Goals of the LDS memory layout:
1542
* 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1543
* in uniform control flow
1544
* 2. Eliminate bank conflicts on read for export if, additionally, there is no
1545
* culling
1546
* 3. Agnostic to the number of waves (since we don't know it before compiling)
1547
* 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1548
* 5. Avoid wasting memory.
1549
*
1550
* We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1551
* layout, elimination of bank conflicts requires that each vertex occupy an
1552
* odd number of dwords. We use the additional dword to store the output stream
1553
* index as well as a flag to indicate whether this vertex ends a primitive
1554
* for rasterization.
1555
*
1556
* Swizzling is required to satisfy points 1 and 2 simultaneously.
1557
*
1558
* Vertices are stored in export order (gsthread * gs.vertices_out + emitidx).
1559
* Indices are swizzled in groups of 32, which ensures point 1 without
1560
* disturbing point 2.
1561
*
1562
* \return an LDS pointer to type {[N x i32], [4 x i8]}
1563
*/
1564
static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
1565
{
1566
struct si_shader_selector *sel = ctx->shader->selector;
1567
LLVMBuilderRef builder = ctx->ac.builder;
1568
LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1569
1570
/* gs.vertices_out = 2^(write_stride_2exp) * some odd number */
1571
unsigned write_stride_2exp = ffs(sel->info.base.gs.vertices_out) - 1;
1572
if (write_stride_2exp) {
1573
LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1574
LLVMValueRef swizzle = LLVMBuildAnd(
1575
builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1576
vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1577
}
1578
1579
return ac_build_gep0(&ctx->ac, storage, vertexidx);
1580
}
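/* Worked example (editor's addition): with gs.vertices_out == 4 the write
 * stride has a power-of-two factor of 4, so write_stride_2exp == 2 and each
 * group of 32 vertices is XOR-swizzled by its row index:
 *
 *   vertexidx  0..31: row 0, swizzle 0 -> unchanged
 *   vertexidx 32..63: row 1, swizzle 1 -> vertexidx ^ 1
 *   vertexidx 64..95: row 2, swizzle 2 -> vertexidx ^ 2
 *
 * which spreads simultaneous writes of the same emitidx across LDS banks,
 * per the layout goals described above.
 */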
1581
1582
static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1583
LLVMValueRef emitidx)
1584
{
1585
struct si_shader_selector *sel = ctx->shader->selector;
1586
LLVMBuilderRef builder = ctx->ac.builder;
1587
LLVMValueRef tmp;
1588
1589
tmp = LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false);
1590
tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1591
const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1592
return ngg_gs_vertex_ptr(ctx, vertexidx);
1593
}
1594
1595
static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
1596
LLVMValueRef vertexptr, unsigned out_idx)
1597
{
1598
LLVMValueRef gep_idx[3] = {
1599
ctx->ac.i32_0, /* implied C-style array */
1600
ctx->ac.i32_0, /* first struct entry */
1601
LLVMConstInt(ctx->ac.i32, out_idx, false),
1602
};
1603
return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1604
}
1605
1606
static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
1607
LLVMValueRef vertexptr, unsigned stream)
1608
{
1609
LLVMValueRef gep_idx[3] = {
1610
ctx->ac.i32_0, /* implied C-style array */
1611
ctx->ac.i32_1, /* second struct entry */
1612
LLVMConstInt(ctx->ac.i32, stream, false),
1613
};
1614
return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1615
}
1616
1617
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
1618
{
1619
const struct si_shader_selector *sel = ctx->shader->selector;
1620
const struct si_shader_info *info = &sel->info;
1621
LLVMBuilderRef builder = ctx->ac.builder;
1622
LLVMValueRef tmp;
1623
const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1624
1625
/* If this thread has already emitted the declared maximum number of
1626
* vertices, skip the write: excessive vertex emissions are not
1627
* supposed to have any effect.
1628
*/
1629
const LLVMValueRef can_emit =
1630
LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1631
LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
1632
1633
tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1634
tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1635
LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1636
1637
ac_build_ifcc(&ctx->ac, can_emit, 9001);
1638
1639
const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
1640
unsigned out_idx = 0;
1641
for (unsigned i = 0; i < info->num_outputs; i++) {
1642
for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1643
if (!(info->output_usagemask[i] & (1 << chan)) ||
1644
((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1645
continue;
1646
1647
LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
1648
out_val = ac_to_integer(&ctx->ac, out_val);
1649
LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1650
}
1651
}
1652
assert(out_idx * 4 == sel->gsvs_vertex_size);
1653
1654
/* Determine and store whether this vertex completed a primitive. */
1655
const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
1656
1657
tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->info.base.gs.output_primitive) - 1, false);
1658
const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1659
1660
/* Since the geometry shader emits triangle strips, we need to
1661
* track which primitive is odd and swap vertex indices to get
1662
* the correct vertex order.
1663
*/
1664
LLVMValueRef is_odd = ctx->ac.i1false;
1665
if (stream == 0 && u_vertices_per_prim(sel->info.base.gs.output_primitive) == 3) {
1666
tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
1667
is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
1668
}
1669
1670
tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1671
LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1672
1673
/* The per-vertex primitive flag encoding:
1674
* bit 0: whether this vertex finishes a primitive
1675
* bit 1: whether the primitive is odd (if we are emitting triangle strips)
1676
*/
1677
tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1678
tmp = LLVMBuildOr(
1679
builder, tmp,
1680
LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
1681
LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1682
1683
tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1684
tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1685
LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1686
1687
ac_build_endif(&ctx->ac, 9001);
1688
}
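/* Worked example (editor's addition): the flag byte stored above for a GS
 * emitting one uninterrupted triangle strip (stream 0):
 *
 *   vertex 0: curverts 0 -> flags 0b00  (no primitive finished yet)
 *   vertex 1: curverts 1 -> flags 0b00
 *   vertex 2: curverts 2 -> flags 0b01  (finishes even triangle)
 *   vertex 3: curverts 3 -> flags 0b11  (finishes odd triangle)
 */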
1689
1690
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
1691
{
1692
/* Zero out the part of LDS scratch that is used to accumulate the
1693
* per-stream generated primitive count.
1694
*/
1695
LLVMBuilderRef builder = ctx->ac.builder;
1696
LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1697
LLVMValueRef tid = get_thread_id_in_tg(ctx);
1698
LLVMValueRef tmp;
1699
1700
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1701
ac_build_ifcc(&ctx->ac, tmp, 5090);
1702
{
1703
LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1704
LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1705
}
1706
ac_build_endif(&ctx->ac, 5090);
1707
1708
ac_build_s_barrier(&ctx->ac);
1709
}
1710
1711
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
1712
{
1713
const struct si_shader_selector *sel = ctx->shader->selector;
1714
const struct si_shader_info *info = &sel->info;
1715
const unsigned verts_per_prim = u_vertices_per_prim(sel->info.base.gs.output_primitive);
1716
LLVMBuilderRef builder = ctx->ac.builder;
1717
LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1718
LLVMValueRef tmp, tmp2;
1719
1720
/* Zero out remaining (non-emitted) primitive flags.
1721
*
1722
* Note: Alternatively, we could pass the relevant gs_next_vertex to
1723
* the emit threads via LDS. This is likely worse in the expected
1724
* typical case where each GS thread emits the full set of
1725
* vertices.
1726
*/
1727
for (unsigned stream = 0; stream < 4; ++stream) {
1728
if (!info->num_stream_output_components[stream])
1729
continue;
1730
1731
const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1732
1733
ac_build_bgnloop(&ctx->ac, 5100);
1734
1735
const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1736
tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1737
LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
1738
ac_build_ifcc(&ctx->ac, tmp, 5101);
1739
ac_build_break(&ctx->ac);
1740
ac_build_endif(&ctx->ac, 5101);
1741
1742
tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1743
LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1744
1745
tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1746
LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1747
1748
ac_build_endloop(&ctx->ac, 5100);
1749
}
1750
1751
/* Accumulate generated primitives counts across the entire threadgroup. */
1752
for (unsigned stream = 0; stream < 4; ++stream) {
1753
if (!info->num_stream_output_components[stream])
1754
continue;
1755
1756
LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1757
numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1758
1759
tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1760
ac_build_ifcc(&ctx->ac, tmp, 5105);
1761
{
1762
LLVMBuildAtomicRMW(
1763
builder, LLVMAtomicRMWBinOpAdd,
1764
ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
1765
numprims, LLVMAtomicOrderingMonotonic, false);
1766
}
1767
ac_build_endif(&ctx->ac, 5105);
1768
}
1769
1770
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1771
1772
ac_build_s_barrier(&ctx->ac);
1773
1774
const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1775
LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1776
1777
/* Streamout */
1778
if (sel->so.num_outputs) {
1779
struct ngg_streamout nggso = {};
1780
1781
nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
1782
1783
LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
1784
for (unsigned stream = 0; stream < 4; ++stream) {
1785
if (!info->num_stream_output_components[stream])
1786
continue;
1787
1788
tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
1789
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1790
tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1791
nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
1792
}
1793
1794
for (unsigned i = 0; i < verts_per_prim; ++i) {
1795
tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
1796
"");
1797
tmp = ngg_gs_vertex_ptr(ctx, tmp);
1798
nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1799
}
1800
1801
build_streamout(ctx, &nggso);
1802
}
1803
1804
/* Write shader query data. */
1805
if (ctx->screen->use_ngg_streamout) {
1806
tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1807
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1808
ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
1809
unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
1810
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
1811
LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
1812
ac_build_ifcc(&ctx->ac, tmp, 5110);
1813
{
1814
LLVMValueRef offset;
1815
tmp = tid;
1816
if (sel->so.num_outputs)
1817
tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
1818
offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
1819
if (sel->so.num_outputs) {
1820
tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
1821
tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
1822
offset = LLVMBuildAdd(builder, offset, tmp, "");
1823
}
1824
1825
tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1826
LLVMValueRef args[] = {
1827
tmp, ngg_get_query_buf(ctx),
1828
offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
1829
ctx->ac.i32_0, /* cachepolicy */
1830
};
1831
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1832
0);
1833
}
1834
ac_build_endif(&ctx->ac, 5110);
1835
ac_build_endif(&ctx->ac, 5109);
1836
}
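/* Worked example (editor's addition): the voffset computed above with
 * streamout enabled (8 query components, tid 0..7):
 *
 *   offset = (tid & 3) * 32 + (tid >> 2) * 8
 *   tid 0..3 -> 0, 32, 64, 96      tid 4..7 -> 8, 40, 72, 104
 *
 * The constant 16 is passed as soffset, so the final address is
 * query_buf + 16 + offset, consistent with the stream[0].generated_primitives
 * offset of 16 used in the non-GS path earlier.  Without streamout only
 * tid 0..3 run and offset is simply tid * 32.
 */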
1837
1838
/* Determine vertex liveness. */
1839
LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1840
1841
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1842
ac_build_ifcc(&ctx->ac, tmp, 5120);
1843
{
1844
for (unsigned i = 0; i < verts_per_prim; ++i) {
1845
const LLVMValueRef primidx =
1846
LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
1847
1848
if (i > 0) {
1849
tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1850
ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1851
}
1852
1853
/* Load primitive liveness */
1854
tmp = ngg_gs_vertex_ptr(ctx, primidx);
1855
tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1856
const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1857
1858
tmp = LLVMBuildLoad(builder, vertliveptr, "");
1859
tmp = LLVMBuildOr(builder, tmp, primlive, "");
LLVMBuildStore(builder, tmp, vertliveptr);
1860
1861
if (i > 0)
1862
ac_build_endif(&ctx->ac, 5121 + i);
1863
}
1864
}
1865
ac_build_endif(&ctx->ac, 5120);
1866
1867
/* Inclusive scan addition across the current wave. */
1868
LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1869
struct ac_wg_scan vertlive_scan = {};
1870
vertlive_scan.op = nir_op_iadd;
1871
vertlive_scan.enable_reduce = true;
1872
vertlive_scan.enable_exclusive = true;
1873
vertlive_scan.src = vertlive;
1874
vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1875
vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1876
vertlive_scan.numwaves = get_tgsize(ctx);
1877
vertlive_scan.maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);
1878
1879
ac_build_wg_scan(&ctx->ac, &vertlive_scan);
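/* Illustrative note (editor's addition): with vertlive values 1,0,1,1 on
 * threads 0..3, the scan yields result_exclusive = 0,1,1,2 and
 * result_reduce = 3.  result_exclusive is used below as the compacted export
 * slot of each live vertex, and result_reduce as the vertex count sent in
 * the GS alloc request.
 */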
1880
1881
/* Skip all exports (including index exports) when possible. */
1882
LLVMValueRef have_exports =
1883
LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1884
num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1885
1886
/* Allocate export space. Send this message as early as possible, to
1887
* hide the latency of the SQ <-> SPI roundtrip.
1888
*/
1889
ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
1890
num_emit_threads);
1891
1892
/* Set up the reverse vertex compaction permutation. We re-use stream 1
1893
* of the primitive liveness flags, relying on the fact that each
1894
* threadgroup can have at most 256 threads. */
1895
ac_build_ifcc(&ctx->ac, vertlive, 5130);
1896
{
1897
tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1898
tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1899
LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1900
}
1901
ac_build_endif(&ctx->ac, 5130);
1902
1903
ac_build_s_barrier(&ctx->ac);
1904
1905
/* Export primitive data */
1906
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1907
ac_build_ifcc(&ctx->ac, tmp, 5140);
1908
{
1909
LLVMValueRef flags;
1910
struct ac_ngg_prim prim = {};
1911
prim.num_vertices = verts_per_prim;
1912
1913
tmp = ngg_gs_vertex_ptr(ctx, tid);
1914
flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1915
prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1916
1917
for (unsigned i = 0; i < verts_per_prim; ++i) {
1918
prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1919
LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1920
prim.edgeflag[i] = ctx->ac.i1false;
1921
}
1922
1923
/* Geometry shaders output triangle strips, but NGG expects triangles. */
1924
if (verts_per_prim == 3) {
1925
LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1926
is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1927
LLVMValueRef flatshade_first = LLVMBuildICmp(
1928
builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
1929
1930
ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
1931
}
1932
1933
ac_build_export_prim(&ctx->ac, &prim);
1934
}
1935
ac_build_endif(&ctx->ac, 5140);
1936
1937
/* Export position and parameter data */
1938
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1939
ac_build_ifcc(&ctx->ac, tmp, 5145);
1940
{
1941
struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1942
1943
tmp = ngg_gs_vertex_ptr(ctx, tid);
1944
tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1945
tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1946
const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1947
1948
unsigned out_idx = 0;
1949
for (unsigned i = 0; i < info->num_outputs; i++) {
1950
outputs[i].semantic = info->output_semantic[i];
1951
1952
for (unsigned j = 0; j < 4; j++, out_idx++) {
1953
tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1954
tmp = LLVMBuildLoad(builder, tmp, "");
1955
outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1956
outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
1957
}
1958
}
1959
1960
si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
1961
}
1962
ac_build_endif(&ctx->ac, 5145);
1963
}
1964
1965
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
1966
unsigned min_verts_per_prim, bool use_adjacency)
1967
{
1968
unsigned max_reuse = max_esverts - min_verts_per_prim;
1969
if (use_adjacency)
1970
max_reuse /= 2;
1971
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1972
}
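/* Worked example (editor's addition): with max_esverts = 128 and triangles
 * (min_verts_per_prim = 3), max_reuse = 125 and max_gsprims is clamped to
 * 126: the first primitive consumes 3 new vertices and every further
 * primitive needs at least one new vertex.  With adjacency the reuse budget
 * is halved, since each extra primitive consumes roughly two new vertices.
 */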
1973
1974
unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
1975
{
1976
const struct si_shader_selector *sel = shader->selector;
1977
1978
if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->so.num_outputs)
1979
return 44;
1980
1981
return 8;
1982
}
1983
1984
/**
1985
* Determine subgroup information like maximum number of vertices and prims.
1986
*
1987
* This happens before the shader is uploaded, since LDS relocations during
1988
* upload depend on the subgroup size.
1989
*/
1990
bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
1991
{
1992
const struct si_shader_selector *gs_sel = shader->selector;
1993
const struct si_shader_selector *es_sel =
1994
shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
1995
const gl_shader_stage gs_stage = gs_sel->info.stage;
1996
const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
1997
const unsigned input_prim = si_get_input_prim(gs_sel);
1998
const bool use_adjacency =
1999
input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
2000
const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
2001
const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
2002
2003
/* All these are in dwords: */
2004
/* GE can only use 8K dwords (32KB) of LDS per workgroup.
2005
*/
2006
const unsigned max_lds_size = 8 * 1024 - gfx10_ngg_get_scratch_dw_size(shader);
2007
const unsigned target_lds_size = max_lds_size;
2008
unsigned esvert_lds_size = 0;
2009
unsigned gsprim_lds_size = 0;
2010
2011
/* All these are per subgroup: */
2012
const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
2013
bool max_vert_out_per_gs_instance = false;
2014
unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
2015
unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
2016
2017
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
2018
/* All lanes are filled in wave32. */
2019
max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32);
2020
max_esverts_base = max_gsprims_base * 3;
2021
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
2022
max_gsprims_base = max_esverts_base - 2;
2023
}
2024
2025
if (gs_stage == MESA_SHADER_GEOMETRY) {
2026
bool force_multi_cycling = false;
2027
unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
2028
2029
retry_select_mode:
2030
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
2031
if (max_out_verts_per_gsprim) {
2032
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
2033
}
2034
} else {
2035
/* Use special multi-cycling mode in which each GS
2036
* instance gets its own subgroup. Does not work with
2037
* tessellation. */
2038
max_vert_out_per_gs_instance = true;
2039
max_gsprims_base = 1;
2040
max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
2041
}
2042
2043
esvert_lds_size = es_sel->esgs_itemsize / 4;
2044
gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2045
2046
if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
2047
if (gs_sel->tess_turns_off_ngg || es_sel->info.stage != MESA_SHADER_TESS_EVAL) {
2048
force_multi_cycling = true;
2049
goto retry_select_mode;
2050
}
2051
}
2052
} else {
2053
/* VS and TES. */
2054
/* LDS size for passing data from ES to GS. */
2055
esvert_lds_size = ngg_nogs_vertex_size(shader);
2056
}
2057
2058
unsigned max_gsprims = max_gsprims_base;
2059
unsigned max_esverts = max_esverts_base;
2060
2061
if (esvert_lds_size)
2062
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2063
if (gsprim_lds_size)
2064
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2065
2066
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2067
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2068
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2069
2070
if (esvert_lds_size || gsprim_lds_size) {
2071
/* Now that we have a rough proportionality between esverts
2072
* and gsprims based on the primitive type, scale both of them
2073
* down simultaneously based on required LDS space.
2074
*
2075
* We could be smarter about this if we knew how much vertex
2076
* reuse to expect.
2077
*/
2078
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2079
if (lds_total > target_lds_size) {
2080
max_esverts = max_esverts * target_lds_size / lds_total;
2081
max_gsprims = max_gsprims * target_lds_size / lds_total;
2082
2083
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2084
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2085
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2086
}
2087
}
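/* Illustrative note (editor's addition): e.g. if the LDS total came out to
 * twice target_lds_size, both limits are roughly halved here and then
 * re-clamped against each other; the wave-size rounding loop below may grow
 * them again, but only within max_lds_size.
 */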
2088
2089
/* Round up towards full wave sizes for better ALU utilization. */
2090
if (!max_vert_out_per_gs_instance) {
2091
const unsigned wavesize = si_get_shader_wave_size(shader);
2092
unsigned orig_max_esverts;
2093
unsigned orig_max_gsprims;
2094
do {
2095
orig_max_esverts = max_esverts;
2096
orig_max_gsprims = max_gsprims;
2097
2098
max_esverts = align(max_esverts, wavesize);
2099
max_esverts = MIN2(max_esverts, max_esverts_base);
2100
if (esvert_lds_size)
2101
max_esverts =
2102
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2103
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2104
2105
/* Hardware restriction: minimum value of max_esverts */
2106
if (gs_sel->screen->info.chip_class == GFX10)
2107
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2108
else
2109
max_esverts = MAX2(max_esverts, min_esverts);
2110
2111
max_gsprims = align(max_gsprims, wavesize);
2112
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2113
if (gsprim_lds_size) {
2114
/* Don't count unusable vertices toward the LDS size. Those are vertices above
2115
* the maximum number of vertices that can occur in the workgroup,
2116
* which is e.g. max_gsprims * 3 for triangles.
2117
*/
2118
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2119
max_gsprims =
2120
MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2121
}
2122
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2123
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2124
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2125
2126
/* Verify the restriction. */
2127
if (gs_sel->screen->info.chip_class == GFX10)
2128
assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2129
else
2130
assert(max_esverts >= min_esverts);
2131
} else {
2132
/* Hardware restriction: minimum value of max_esverts */
2133
if (gs_sel->screen->info.chip_class == GFX10)
2134
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2135
else
2136
max_esverts = MAX2(max_esverts, min_esverts);
2137
}
2138
2139
unsigned max_out_vertices =
2140
max_vert_out_per_gs_instance
2141
? gs_sel->info.base.gs.vertices_out
2142
: gs_stage == MESA_SHADER_GEOMETRY
2143
? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
2144
: max_esverts;
2145
assert(max_out_vertices <= 256);
2146
2147
unsigned prim_amp_factor = 1;
2148
if (gs_stage == MESA_SHADER_GEOMETRY) {
2149
/* Number of output primitives per GS input primitive after
2150
* GS instancing. */
2151
prim_amp_factor = gs_sel->info.base.gs.vertices_out;
2152
}
2153
2154
/* Fix up the thread counts for fast launch. */
2155
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
2156
/* The vertex count must be a multiple of 3. */
2157
max_esverts -= max_esverts % 3;
2158
/* We can only decrease the size, not increase it. */
2159
if (max_gsprims * 3 < max_esverts) {
2160
max_esverts = max_gsprims * 3;
2161
} else {
2162
max_gsprims = max_esverts / 3;
2163
}
2164
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
2165
/* The primitive count must be even to get correct winding for triangle strips. */
2166
max_gsprims &= ~1;
2167
if (max_gsprims - 2 < max_esverts) {
2168
max_esverts = max_gsprims + 2;
2169
} else {
2170
max_gsprims = max_esverts - 2;
2171
max_gsprims &= ~1;
2172
max_esverts = max_gsprims + 2;
2173
}
2174
}
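/* Worked example (editor's addition) for the TRI_LIST case: max_esverts = 100
 * is first rounded down to 99; with max_gsprims = 128, 128 * 3 >= 99, so
 * max_gsprims becomes 99 / 3 = 33 and the final pair is 99 vertices /
 * 33 primitives, i.e. exactly three unshared vertices per primitive.
 */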
2175
2176
shader->ngg.hw_max_esverts = max_esverts;
2177
shader->ngg.max_gsprims = max_gsprims;
2178
shader->ngg.max_out_verts = max_out_vertices;
2179
shader->ngg.prim_amp_factor = prim_amp_factor;
2180
shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2181
2182
/* Don't count unusable vertices. */
2183
shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
2184
esvert_lds_size;
2185
shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
2186
2187
assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
2188
2189
/* If asserts are disabled, we use the same conditions to return false */
2190
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
2191
max_out_vertices <= 256 &&
2192
shader->ngg.hw_max_esverts >= min_esverts;
2193
}
2194
2195