GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
1
/*
2
* Copyright 2019 Advanced Micro Devices, Inc.
3
* All Rights Reserved.
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* on the rights to use, copy, modify, merge, publish, distribute, sub
9
* license, and/or sell copies of the Software, and to permit persons to whom
10
* the Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22
* USE OR OTHER DEALINGS IN THE SOFTWARE.
23
*
24
*/
25
26
#include "ac_llvm_cull.h"
27
#include "si_build_pm4.h"
28
#include "si_pipe.h"
29
#include "si_shader_internal.h"
30
#include "sid.h"
31
#include "util/fast_idiv_by_const.h"
32
#include "util/u_prim.h"
33
#include "util/u_suballoc.h"
34
#include "util/u_upload_mgr.h"
35
36
/* Based on:
37
* https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
38
*/
39
40
/* This file implements primitive culling using asynchronous compute.
41
*
42
* It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
43
* in a compute shader. The shader processes 1 primitive/thread by invoking
44
* the VS for each vertex to get the positions, decomposes strips
45
* into triangles (if needed), eliminates primitive restart (if needed),
46
* does (W<0) culling, face culling, view XY culling, zero-area and
47
* small-primitive culling, and generates a new index buffer that doesn't
48
* contain culled primitives.
49
*
50
* There is no primitive ordering. The generated index buffer will contain
51
* primitives in a random order.
52
*
53
* IB = a GPU command buffer
54
*
55
* Both the compute and gfx IBs run in parallel sort of like CE and DE.
56
* The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
57
 * doesn't continue if its word isn't 0x80000000. The vertex count inside
58
 * the draw packet is incremented atomically by the compute shader. A CS_DONE event signals
59
* the REWIND packet to continue. It's really a direct draw with command
60
* buffer patching from the compute queue.
61
*
62
* The compute IB doesn't have to start when its corresponding gfx IB starts,
63
* but can start sooner. The compute IB is signaled to start after the last
64
* execution barrier in the *previous* gfx IB. This is handled as follows.
65
* The kernel GPU scheduler starts the compute IB after the previous gfx IB has
66
* started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
67
* represents the barrier in the previous gfx IB.
68
*
69
* Features:
70
* - Triangle strips are decomposed into an indexed triangle list.
71
* The decomposition differs based on the provoking vertex state.
72
* - Instanced draws are converted into non-instanced draws for 16-bit indices.
73
* (InstanceID is stored in the high bits of VertexID and unpacked by VS)
74
* - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
75
* - Back face culling, incl. culling zero-area / degenerate primitives.
76
* - View XY culling.
77
* - Small primitive culling for all MSAA modes and all quant modes.
78
*
79
* The following are not implemented:
80
* - ClipVertex/ClipDistance/CullDistance-based culling.
81
* - Scissor culling.
82
* - HiZ culling.
83
*
84
* Limitations (and unimplemented features that may be possible to implement):
85
* - Only triangles and triangle strips are supported.
86
* - Primitive restart is not supported.
87
* - Instancing is only supported with 16-bit indices and instance count <= 2^16.
88
* - The instance divisor buffer is unavailable, so all divisors must be
89
* either 0 or 1.
90
* - Multidraws where the vertex shader reads gl_DrawID are unsupported.
91
* - No support for tessellation and geometry shaders.
92
* (patch elimination where tess factors are 0 would be possible to implement)
93
* - The vertex shader must not contain memory stores.
94
 * - No VS resource may have a write usage in the command buffer.
95
* - Bindless textures and images must not occur in the vertex shader.
96
*
97
* User data SGPR layout:
98
* VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader.
99
* START_OUT_INDEX: output index buffer offset / 12
100
* START_IN_INDEX: input index buffer offset / index_size
101
* VS.BASE_VERTEX: same value as VS
102
* INDEX_BUFFERS: pointer to constants
103
* 0..3: input index buffer - typed buffer view
104
* 4..7: output index buffer - typed buffer view
105
* 8..11: viewport state - scale.xy, translate.xy
106
* VS.VERTEX_BUFFERS: same value as VS
107
* VS.CONST_AND_SHADER_BUFFERS: same value as VS
108
* VS.SAMPLERS_AND_IMAGES: same value as VS
109
* VS.START_INSTANCE: same value as VS
110
* SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
111
* NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
112
* per instance for instancing.
113
* NUM_PRIMS_UDIV_TERMS:
114
* - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
115
* - Bits [5:31]: The number of primitives per instance for computing the remainder.
116
*
117
* How to test primitive restart (the most complicated part because it needs
118
* to get the primitive orientation right):
119
* Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
120
* primitive orientation flips with small draw calls, which is what most tests use.
121
* You can also enable draw call splitting into draw calls with just 2 primitives.
122
*/
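/* Illustrative sketch (not part of the driver): how NUM_PRIMS_UDIV_TERMS is
 * packed on the CPU and consumed per thread, assuming only the bit layout
 * documented above. The real multiplier and post_shift come from
 * si_compute_fast_udiv_info32(); plain '/' stands in for the multiply+shift.
 */
#if 0
#include <stdint.h>

static uint32_t sketch_pack_udiv_terms(uint32_t post_shift, uint32_t prims_per_instance)
{
   /* Bits [0:4] = post_shift, bits [5:31] = primitives per instance. */
   return (post_shift & 0x1f) | (prims_per_instance << 5);
}

static void sketch_split_prim_id(uint32_t prim_id, uint32_t udiv_terms,
                                 uint32_t *instance_id, uint32_t *prim_in_instance)
{
   uint32_t prims_per_instance = udiv_terms >> 5;

   /* Reference arithmetic for what the shader does with the fast division. */
   *instance_id = prim_id / prims_per_instance;
   *prim_in_instance = prim_id - *instance_id * prims_per_instance;
}
#endif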
123
124
/* At least 256 is needed for the fastest wave launch rate from compute queues
125
* due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
126
#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
127
#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
128
#define MAX_WAVES_PER_SH 0 /* no limit */
129
#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
130
131
/* Grouping compute dispatches for small draw calls: How many primitives from multiple
132
* draw calls to process by compute before signaling the gfx IB. This reduces the number
133
 * of EOP events + REWIND packets, which decrease performance.
134
* This also determines the granularity of draw-level and packet-level splitting.
135
*/
136
#define PRIMS_PER_IB (1024 * 1024) /* size per gfx IB */
137
#define PRIMS_PER_BATCH (128 * 1024) /* size between REWIND packets */
138
139
/* Derived values. */
140
#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
141
142
#define REWIND_SIGNAL_BIT 0x80000000
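/* Illustrative sketch (not part of the driver): how these constants shape one
 * sub-dispatch. A sub-draw covers at most PRIMS_PER_BATCH primitives and is
 * launched as 1D thread groups of THREADGROUP_SIZE threads, one primitive per
 * thread; this mirrors the grid setup in si_dispatch_prim_discard_cs_and_draw().
 */
#if 0
static unsigned sketch_num_threadgroups(unsigned num_subdraw_prims)
{
   unsigned full = num_subdraw_prims / THREADGROUP_SIZE;
   unsigned partial = num_subdraw_prims % THREADGROUP_SIZE;
   return full + !!partial; /* one extra, partially filled group if needed */
}
#endif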
143
144
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr);
145
146
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
147
unsigned *prim_discard_vertex_count_threshold,
148
unsigned *index_ring_size_per_ib)
149
{
150
*prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
151
152
if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */
153
sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
154
return;
155
156
/* TODO: enable this */
157
bool enable_by_default = false;
158
159
if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
160
(enable_by_default && sscreen->allow_draw_out_of_order &&
161
sscreen->info.num_se >= 2)) {
162
*prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
163
164
if (sscreen->debug_flags & DBG(ALWAYS_PD))
165
*prim_discard_vertex_count_threshold = 0; /* always enable */
166
167
/* The total size is double this per context. Greater numbers allow bigger gfx IBs. */
168
*index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. */
169
}
170
}
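/* Hypothetical usage sketch (the real check lives in the draw path, not here):
 * compute-based discard is only considered once a draw's vertex count reaches
 * the threshold written above; UINT_MAX effectively disables it and 0 always
 * enables it.
 */
#if 0
#include <limits.h>

static bool sketch_vertex_count_passes_threshold(unsigned vertex_count, unsigned threshold)
{
   return threshold != UINT_MAX && vertex_count >= threshold;
}
#endif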
171
172
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
173
{
174
uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
175
ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
176
ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
177
return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
178
LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
179
}
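/* Host-side equivalent of the expansion above (illustrative only): every
 * buffer in the 32-bit address space shares the same upper dword, so the full
 * 64-bit VA is reconstructed from address32_hi and the 32-bit pointer.
 */
#if 0
#include <stdint.h>

static uint64_t sketch_expand_32bit_va(uint32_t ptr_lo, uint32_t address32_hi)
{
   return ((uint64_t)address32_hi << 32) | ptr_lo;
}
#endif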
180
181
struct si_thread0_section {
182
struct si_shader_context *ctx;
183
LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
184
LLVMValueRef saved_exec;
185
};
186
187
/* Enter a section that only executes on thread 0. */
188
static void si_enter_thread0_section(struct si_shader_context *ctx,
189
struct si_thread0_section *section, LLVMValueRef thread_id,
190
LLVMValueRef check_nonzero)
191
{
192
section->ctx = ctx;
193
section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
194
195
/* This IF has 4 instructions:
196
* v_and_b32_e32 v, 63, v ; get the thread ID
197
* v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
198
* s_and_saveexec_b64 s, vcc
199
* s_cbranch_execz BB0_4
200
*
201
* It could just be s_and_saveexec_b64 s, 1.
202
*/
203
LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, "");
204
if (check_nonzero) {
205
cond = LLVMBuildAnd(ctx->ac.builder, cond,
206
LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, check_nonzero,
207
ctx->ac.i32_0, ""), "");
208
}
209
ac_build_ifcc(&ctx->ac, cond, 12601);
210
}
211
212
/* Exit a section that only executes on thread 0 and broadcast the result
213
* to all threads. */
214
static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
215
{
216
struct si_shader_context *ctx = section->ctx;
217
218
LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
219
220
ac_build_endif(&ctx->ac, 12601);
221
222
/* Broadcast the result from thread 0 to all threads. */
223
*result =
224
ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
225
}
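/* Usage pattern (see si_build_primitive_accepted() below): enter the section,
 * compute a scalar value on thread 0 only (e.g. one atomic add per wave), then
 * exit to broadcast it:
 *
 *    struct si_thread0_section section;
 *    si_enter_thread0_section(ctx, &section, thread_id, NULL);
 *    result = ...;                                // thread 0 only
 *    si_exit_thread0_section(&section, &result);  // result is now uniform
 */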
226
227
static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
228
void *data);
229
230
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
231
{
232
struct si_shader_key *key = &ctx->shader->key;
233
LLVMBuilderRef builder = ctx->ac.builder;
234
LLVMValueRef vs = ctx->main_fn;
235
236
/* Always inline the VS function. */
237
ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
238
LLVMSetLinkage(vs, LLVMPrivateLinkage);
239
240
enum ac_arg_type const_desc_type;
241
if (ctx->shader->selector->info.base.num_ubos == 1 &&
242
ctx->shader->selector->info.base.num_ssbos == 0)
243
const_desc_type = AC_ARG_CONST_FLOAT_PTR;
244
else
245
const_desc_type = AC_ARG_CONST_DESC_PTR;
246
247
memset(&ctx->args, 0, sizeof(ctx->args));
248
249
struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
250
struct ac_arg param_vb_desc, param_const_desc, param_start_out_index;
251
struct ac_arg param_base_vertex, param_start_instance, param_start_in_index;
252
struct ac_arg param_block_id, param_local_id, param_smallprim_precision;
253
struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
254
struct ac_arg param_sampler_desc;
255
256
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
257
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_out_index);
258
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_in_index);
259
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
260
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_index_buffers_and_constants);
261
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
262
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
263
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
264
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
265
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
266
if (key->opt.cs_instancing) {
267
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
268
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
269
}
270
271
/* Block ID and thread ID inputs. */
272
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
273
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
274
275
/* Create the compute shader function. */
276
gl_shader_stage old_stage = ctx->stage;
277
ctx->stage = MESA_SHADER_COMPUTE;
278
si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
279
ctx->stage = old_stage;
280
281
/* Assemble parameters for VS. */
282
LLVMValueRef vs_params[16];
283
unsigned num_vs_params = 0;
284
unsigned param_vertex_id, param_instance_id;
285
286
vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */
287
vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
288
vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
289
vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
290
vs_params[num_vs_params++] =
291
LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
292
vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
293
vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
294
vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
295
vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
296
297
vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
298
vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
299
vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
300
vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
301
302
assert(num_vs_params <= ARRAY_SIZE(vs_params));
303
assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
304
305
/* Load descriptors. (load 8 dwords at once) */
306
LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
307
308
LLVMValueRef index_buffers_and_constants =
309
ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
310
tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
311
ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
312
tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
313
314
for (unsigned i = 0; i < 8; i++)
315
desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
316
317
input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
318
output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
319
320
/* Compute PrimID and InstanceID. */
321
LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
322
LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
323
ac_get_arg(&ctx->ac, param_local_id));
324
LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
325
LLVMValueRef instance_id = ctx->ac.i32_0;
326
327
if (key->opt.cs_instancing) {
328
LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
329
LLVMValueRef num_prims_udiv_multiplier =
330
ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
331
/* Unpack num_prims_udiv_terms. */
332
LLVMValueRef post_shift =
333
LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
334
LLVMValueRef prims_per_instance =
335
LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
336
/* Divide the total prim_id by the number of prims per instance. */
337
instance_id =
338
ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
339
/* Compute the remainder. */
340
prim_id = LLVMBuildSub(builder, prim_id,
341
LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
342
}
343
344
/* Generate indices (like a non-indexed draw call). */
345
LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
346
unsigned vertices_per_prim = 3;
347
348
switch (key->opt.cs_prim_type) {
349
case PIPE_PRIM_TRIANGLES:
350
for (unsigned i = 0; i < 3; i++) {
351
index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
352
LLVMConstInt(ctx->ac.i32, i, 0));
353
}
354
break;
355
case PIPE_PRIM_TRIANGLE_STRIP:
356
for (unsigned i = 0; i < 3; i++) {
357
index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
358
}
359
break;
360
default:
361
unreachable("unexpected primitive type");
362
}
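   /* Example: for prim_id = 5, a triangle list reads vertices {15, 16, 17}
    * (prim_id * 3 + i), while a triangle strip reads {5, 6, 7} (prim_id + i);
    * the strip's winding fix-up is applied further below.
    */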
363
364
/* Fetch indices. */
365
if (key->opt.cs_indexed) {
366
for (unsigned i = 0; i < 3; i++) {
367
index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), "");
368
index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
369
1, 0, true, false, false);
370
index[i] = ac_to_integer(&ctx->ac, index[i]);
371
}
372
}
373
374
LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
375
LLVMConstInt(ctx->ac.i32, 63, 0), "");
376
377
/* Every other triangle in a strip has a reversed vertex order, so we
378
* need to swap vertices of odd primitives to get the correct primitive
379
* orientation when converting triangle strips to triangles. Primitive
380
* restart complicates it, because a strip can start anywhere.
381
*/
382
LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
383
LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
384
385
if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
386
/* Without primitive restart, odd primitives have reversed orientation.
387
* Only primitive restart can flip it with respect to the first vertex
388
* of the draw call.
389
*/
390
/* prim_is_odd = current_is_odd % 2. */
391
LLVMValueRef prim_is_odd = LLVMBuildXor(
392
builder, ctx->ac.i1false, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
393
394
/* Convert triangle strip indices to triangle indices. */
395
ac_build_triangle_strip_indices_to_triangle(
396
&ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
397
index);
398
}
399
400
/* Execute the vertex shader for each vertex to get vertex positions. */
401
LLVMValueRef pos[3][4];
402
for (unsigned i = 0; i < vertices_per_prim; i++) {
403
vs_params[param_vertex_id] = index[i];
404
vs_params[param_instance_id] = instance_id;
405
406
LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
407
for (unsigned chan = 0; chan < 4; chan++)
408
pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
409
}
410
411
/* Divide XYZ by W. */
412
for (unsigned i = 0; i < vertices_per_prim; i++) {
413
for (unsigned chan = 0; chan < 3; chan++)
414
pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
415
}
416
417
/* Load the viewport state. */
418
LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
419
LLVMConstInt(ctx->ac.i32, 2, 0));
420
vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
421
LLVMValueRef vp_scale[2], vp_translate[2];
422
vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
423
vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
424
vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
425
vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
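   /* These four floats are dwords 8..11 of INDEX_BUFFERS (viewport scale.xy and
    * translate.xy), matching the user data SGPR layout described at the top of
    * this file.
    */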
426
427
/* Do culling. */
428
struct ac_cull_options options = {};
429
options.cull_front = key->opt.cs_cull_front;
430
options.cull_back = key->opt.cs_cull_back;
431
options.cull_view_xy = true;
432
options.cull_small_prims = true;
433
options.cull_zero_area = true;
434
options.cull_w = true;
435
436
LLVMValueRef params[] = {
437
instance_id,
438
vertex_counter,
439
output_indexbuf,
440
(void*)index,
441
ac_get_arg(&ctx->ac, param_start_out_index),
442
};
443
444
ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
445
ac_get_arg(&ctx->ac, param_smallprim_precision), &options,
446
si_build_primitive_accepted, params);
447
LLVMBuildRetVoid(builder);
448
}
449
450
static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
451
void *userdata)
452
{
453
struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
454
struct si_shader_key *key = &ctx->shader->key;
455
LLVMBuilderRef builder = ctx->ac.builder;
456
unsigned vertices_per_prim = 3;
457
LLVMValueRef *params = (LLVMValueRef *)userdata;
458
LLVMValueRef instance_id = params[0];
459
LLVMValueRef vertex_counter = params[1];
460
LLVMValueRef output_indexbuf = params[2];
461
LLVMValueRef *index = (LLVMValueRef *)params[3];
462
LLVMValueRef start_out_index = params[4];
463
464
LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
465
466
ac_build_ifcc(&ctx->ac, accepted, 16607);
467
468
/* Count the number of active threads by doing bitcount(accepted). */
469
LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask);
470
num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
471
472
/* Get the number of bits set before the index of this thread. */
473
LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
474
LLVMValueRef start;
475
476
/* Execute atomic_add on the vertex count. */
477
struct si_thread0_section section;
478
si_enter_thread0_section(ctx, &section, prim_index, num_prims_accepted);
479
{
480
LLVMValueRef num_indices = LLVMBuildMul(
481
builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
482
vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
483
start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
484
LLVMAtomicOrderingMonotonic, false);
485
}
486
si_exit_thread0_section(&section, &start);
487
488
/* Convert it into the primitive index. */
489
start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
490
491
/* Now we need to store the indices of accepted primitives into
492
* the output index buffer.
493
*/
494
495
/* We have lowered instancing. Pack the instance ID into vertex ID. */
496
if (key->opt.cs_instancing) {
497
instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
498
499
for (unsigned i = 0; i < vertices_per_prim; i++)
500
index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
501
}
502
503
/* Write indices for accepted primitives. */
504
LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
505
vindex = LLVMBuildAdd(builder, vindex, start_out_index, "");
506
LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
507
508
if (!ac_has_vec3_support(ctx->ac.chip_class, true))
509
vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
510
511
ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
512
ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
513
ac_build_endif(&ctx->ac, 16607);
514
}
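/* Illustrative sketch (plain C, not driver code) of the compaction scheme used
 * above, simplified to count primitives directly (the shader counts vertices
 * and divides by 3): one atomic add per wave reserves space for all accepted
 * primitives, and each accepted lane writes at base + popcount(mask below it),
 * which is what ac_build_bit_count + ac_build_mbcnt implement on the GPU.
 */
#if 0
#include <stdint.h>

static unsigned sketch_output_slot(uint64_t accepted_mask, unsigned lane, /* lane: 0..63 */
                                   unsigned *prim_counter /* atomic in HW */)
{
   unsigned num_accepted = (unsigned)__builtin_popcountll(accepted_mask);
   unsigned slots_below = (unsigned)__builtin_popcountll(accepted_mask &
                                                         ((1ull << lane) - 1));

   /* Done once per wave on thread 0, then broadcast (see si_thread0_section). */
   unsigned base = *prim_counter;
   *prim_counter += num_accepted;

   return base + slots_below;
}
#endif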
515
516
/* Return false if the shader isn't ready. */
517
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
518
const struct pipe_draw_info *info)
519
{
520
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
521
struct si_shader_key key;
522
523
memset(&key, 0, sizeof(key));
524
si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog);
525
assert(!key.part.vs.prolog.instance_divisor_is_fetched);
526
527
key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
528
key.opt.vs_as_prim_discard_cs = 1;
529
key.opt.cs_prim_type = info->mode;
530
key.opt.cs_indexed = info->index_size != 0;
531
key.opt.cs_instancing = info->instance_count > 1;
532
key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
533
534
if (rs->rasterizer_discard) {
535
/* Just for performance testing and analysis of trivial bottlenecks.
536
* This should result in a very short compute shader. */
537
key.opt.cs_cull_front = 1;
538
key.opt.cs_cull_back = 1;
539
} else {
540
key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front;
541
key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back;
542
}
543
544
sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso;
545
sctx->cs_prim_discard_state.current = NULL;
546
547
if (!sctx->compiler.passes)
548
si_init_compiler(sctx->screen, &sctx->compiler);
549
550
struct si_compiler_ctx_state compiler_state;
551
compiler_state.compiler = &sctx->compiler;
552
compiler_state.debug = sctx->debug;
553
compiler_state.is_debug_context = sctx->is_debug;
554
555
return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
556
&key, -1, true) == 0 &&
557
/* Disallow compute shaders using the scratch buffer. */
558
sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
559
}
560
561
static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
562
{
563
if (sctx->index_ring)
564
return true;
565
566
if (!sctx->prim_discard_compute_cs.priv) {
567
struct radeon_winsys *ws = sctx->ws;
568
569
if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs,
570
&sctx->gfx_cs, false))
571
return false;
572
}
573
574
if (!sctx->index_ring) {
575
sctx->index_ring = si_aligned_buffer_create(
576
sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
577
PIPE_USAGE_DEFAULT,
578
sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
579
if (!sctx->index_ring)
580
return false;
581
}
582
return true;
583
}
584
585
static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
586
{
587
return sctx->index_ring_offset +
588
align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
589
sctx->index_ring_size_per_ib;
590
}
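/* Sizing rule of thumb: the output index buffer needs 12 bytes per triangle
 * (3 x 32-bit indices), so e.g. a 100k-triangle draw reserves ~1.2 MB of ring
 * space, rounded up to the TCC cache line size.
 */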
591
592
#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)
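/* The terms correspond to the worst-case IB initialization below: 8 DW cache
 * flush/invalidate, 39 DW initial compute registers, 11 DW scratch + grid
 * registers, and 7 DW for the mid-IB fence wait.
 */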
593
594
enum si_prim_discard_outcome
595
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
596
unsigned drawid_offset,
597
const struct pipe_draw_start_count_bias *draws,
598
unsigned num_draws, unsigned total_count)
599
{
600
/* If the compute shader compilation isn't finished, this returns false. */
601
if (!si_shader_select_prim_discard_cs(sctx, info))
602
return SI_PRIM_DISCARD_DISABLED;
603
604
if (!si_initialize_prim_discard_cmdbuf(sctx))
605
return SI_PRIM_DISCARD_DISABLED;
606
607
struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
608
unsigned prim = info->mode;
609
unsigned instance_count = info->instance_count;
610
611
unsigned num_prims_per_instance;
612
if (prim == PIPE_PRIM_TRIANGLES)
613
num_prims_per_instance = total_count / 3;
614
else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
615
num_prims_per_instance = total_count - 2; /* approximation ignoring multi draws */
616
else
617
unreachable("shouldn't get here");
618
619
unsigned num_prims = num_prims_per_instance * instance_count;
620
unsigned out_indexbuf_size = num_prims * 12;
621
bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
622
623
/* Split draws at the draw call level if the ring is full. This makes
624
* better use of the ring space.
625
*
626
* If instancing is enabled and there is not enough ring buffer space, compute-based
627
* primitive discard is disabled.
628
*/
629
if (ring_full && num_prims > PRIMS_PER_BATCH && instance_count == 1) {
630
unsigned vert_count_per_subdraw = 0;
631
632
if (prim == PIPE_PRIM_TRIANGLES)
633
vert_count_per_subdraw = PRIMS_PER_BATCH * 3;
634
else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
635
vert_count_per_subdraw = PRIMS_PER_BATCH;
636
637
/* Split multi draws first. */
638
if (num_draws > 1) {
639
unsigned count = 0;
640
unsigned first_draw = 0;
641
unsigned num_draws_split = 0;
642
643
for (unsigned i = 0; i < num_draws; i++) {
644
if (count && count + draws[i].count > vert_count_per_subdraw) {
645
/* Submit previous draws. */
646
sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
647
count = 0;
648
first_draw = i;
649
num_draws_split = 0;
650
}
651
652
if (draws[i].count > vert_count_per_subdraw) {
653
/* Submit just 1 draw. It will be split. */
654
sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + i, 1);
655
assert(count == 0);
656
assert(first_draw == i);
657
assert(num_draws_split == 0);
658
first_draw = i + 1;
659
continue;
660
}
661
662
count += draws[i].count;
663
num_draws_split++;
664
}
665
666
if (count) {
667
/* Submit the remaining draws. */
668
assert(num_draws_split > 0);
669
sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
670
}
671
return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT;
672
}
673
674
/* Split single draws if splitting multi draws isn't enough. */
675
struct pipe_draw_info split_draw = *info;
676
struct pipe_draw_start_count_bias split_draw_range = draws[0];
677
unsigned base_start = split_draw_range.start;
678
unsigned count = draws[0].count;
679
680
if (prim == PIPE_PRIM_TRIANGLES) {
681
assert(vert_count_per_subdraw < count);
682
683
for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
684
split_draw_range.start = base_start + start;
685
split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);
686
687
sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
688
}
689
} else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
690
/* No primitive pair can be split, because strips reverse orientation
691
* for odd primitives. */
692
STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0);
693
694
for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
695
split_draw_range.start = base_start + start;
696
split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);
697
698
sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
699
}
700
}
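      /* Consecutive strip sub-draws overlap by 2 vertices so no triangle is
       * lost at the split point, and because PRIMS_PER_BATCH is even, every
       * sub-draw starts on an even primitive and the winding stays correct.
       */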
701
702
return SI_PRIM_DISCARD_DRAW_SPLIT;
703
}
704
705
/* Just quit if the draw call doesn't fit into the ring and can't be split. */
706
if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
707
if (SI_PRIM_DISCARD_DEBUG)
708
puts("PD failed: draw call too big, can't be split");
709
return SI_PRIM_DISCARD_DISABLED;
710
}
711
712
/* Compute how many CS dwords we need to reserve. */
713
unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE +
714
11 /* shader */ +
715
30; /* leave some space at the end */
716
unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
717
718
for (unsigned i = 0; i < num_draws; i++) {
719
unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH);
720
721
need_compute_dw += 8 * num_subdraws + /* signal REWIND */
722
14 /* user SGPRs */ +
723
4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */
724
11 * num_subdraws;
725
need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
726
}
727
728
if (ring_full ||
729
!sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
730
/* If the current IB is empty but doesn't have enough space, add a NOP
731
* packet to force a flush and get a bigger IB.
732
*/
733
if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
734
gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
735
radeon_begin(gfx_cs);
736
radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
737
radeon_emit(gfx_cs, 0);
738
radeon_end();
739
}
740
741
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
742
}
743
744
/* The compute IB is always chained, but we need to call cs_check_space to add more space. */
745
struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
746
ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
747
assert(compute_has_space);
748
assert(si_check_ring_space(sctx, out_indexbuf_size));
749
assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw);
750
return SI_PRIM_DISCARD_ENABLED;
751
}
752
753
void si_compute_signal_gfx(struct si_context *sctx)
754
{
755
struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
756
unsigned writeback_L2_flags = 0;
757
758
/* GFX8 needs to flush L2 for CP to see the updated vertex count. */
759
if (sctx->chip_class == GFX8)
760
writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
761
762
if (!sctx->compute_num_prims_in_batch)
763
return;
764
765
assert(sctx->compute_rewind_va);
766
767
/* After the queued dispatches are done and vertex counts are written to
768
* the gfx IB, signal the gfx IB to continue. CP doesn't wait for
769
* the dispatches to finish, it only adds the CS_DONE event into the event
770
* queue.
771
*/
772
si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
773
sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
774
writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
775
EOP_DATA_SEL_VALUE_32BIT, NULL,
776
sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
777
REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
778
SI_NOT_QUERY);
779
780
sctx->compute_rewind_va = 0;
781
sctx->compute_num_prims_in_batch = 0;
782
}
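/* Informal timeline of the handshake: the compute queue runs the culling
 * dispatches, the CS_DONE release_mem above then writes REWIND_SIGNAL_BIT to
 * compute_rewind_va (the payload dword of the REWIND packet in the gfx IB),
 * and the gfx CP, which has been spinning on that dword, releases the patched
 * DRAW_INDEX_2 that follows it.
 */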
783
784
/* Dispatch a primitive discard compute shader. */
785
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
786
const struct pipe_draw_info *info,
787
const struct pipe_draw_start_count_bias *draws,
788
unsigned num_draws, unsigned index_size,
789
unsigned total_count, uint64_t input_indexbuf_va,
790
unsigned index_max_size)
791
{
792
struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
793
struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
794
unsigned num_total_prims;
795
unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
796
797
if (!info->instance_count)
798
return;
799
800
switch (info->mode) {
801
case PIPE_PRIM_TRIANGLES:
802
case PIPE_PRIM_TRIANGLE_STRIP:
803
if (info->mode == PIPE_PRIM_TRIANGLES)
804
num_total_prims = total_count / 3;
805
else if (total_count >= 2)
806
num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */
807
else
808
num_total_prims = 0;
809
810
vertices_per_prim = 3;
811
output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
812
gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
813
break;
814
default:
815
unreachable("unsupported primitive type");
816
return;
817
}
818
819
if (!num_total_prims)
820
return;
821
822
num_total_prims *= info->instance_count;
823
824
unsigned out_indexbuf_offset;
825
uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4;
826
827
/* Initialize the compute IB if it's empty. */
828
if (!sctx->prim_discard_compute_ib_initialized) {
829
/* 1) State initialization. */
830
sctx->compute_ib_last_shader = NULL;
831
832
if (sctx->last_ib_barrier_fence) {
833
assert(!sctx->last_ib_barrier_buf);
834
sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
835
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
836
}
837
838
/* 2) IB initialization. */
839
840
/* This needs to be done at the beginning of IBs due to possible
841
* TTM buffer moves in the kernel.
842
*/
843
if (sctx->chip_class >= GFX10) { /* 8 DW */
844
radeon_begin(cs);
845
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
846
radeon_emit(cs, 0); /* CP_COHER_CNTL */
847
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
848
radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
849
radeon_emit(cs, 0); /* CP_COHER_BASE */
850
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
851
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
852
radeon_emit(cs, /* GCR_CNTL */
853
S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
854
S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
855
S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
856
radeon_end();
857
} else {
858
si_emit_surface_sync(sctx, cs,
859
S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
860
S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
861
S_0085F0_SH_ICACHE_ACTION_ENA(1) |
862
S_0085F0_SH_KCACHE_ACTION_ENA(1));
863
}
864
865
si_emit_initial_compute_regs(sctx, cs); /* 39 DW */
866
867
radeon_begin(cs); /* 11 DW */
868
radeon_set_sh_reg(
869
cs, R_00B860_COMPUTE_TMPRING_SIZE,
870
S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
871
872
/* Only 1D grids are launched. */
873
radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
874
radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
875
radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
876
877
radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
878
radeon_emit(cs, 0);
879
radeon_emit(cs, 0);
880
radeon_end();
881
882
if (sctx->last_ib_barrier_buf) {
883
assert(!sctx->last_ib_barrier_fence);
884
radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
885
RADEON_PRIO_FENCE);
886
si_cp_wait_mem(sctx, cs, /* 7 DW */
887
sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
888
1, 1, WAIT_REG_MEM_EQUAL);
889
}
890
891
sctx->prim_discard_compute_ib_initialized = true;
892
assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE);
893
}
894
895
/* Allocate the output index buffer. */
896
output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
897
assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
898
out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
899
sctx->index_ring_offset += output_indexbuf_size;
900
901
radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
902
RADEON_PRIO_SHADER_RW_BUFFER);
903
uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
904
905
/* Prepare index buffer descriptors. */
906
struct si_resource *indexbuf_desc = NULL;
907
unsigned indexbuf_desc_offset;
908
unsigned desc_size = 12 * 4;
909
uint32_t *desc;
910
911
u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
912
&indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
913
radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
914
RADEON_PRIO_DESCRIPTORS);
915
916
/* Input index buffer. */
917
desc[0] = input_indexbuf_va;
918
desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
919
desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1);
920
921
if (sctx->chip_class >= GFX10) {
922
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
923
S_008F0C_FORMAT(index_size == 1 ? V_008F0C_GFX10_FORMAT_8_UINT
924
: index_size == 2 ? V_008F0C_GFX10_FORMAT_16_UINT
925
: V_008F0C_GFX10_FORMAT_32_UINT) |
926
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
927
S_008F0C_RESOURCE_LEVEL(1);
928
} else {
929
desc[3] =
930
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
931
S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
932
: index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
933
: V_008F0C_BUF_DATA_FORMAT_32);
934
}
935
936
/* Output index buffer. */
937
desc[4] = out_indexbuf_va;
938
desc[5] =
939
S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
940
desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
941
942
if (sctx->chip_class >= GFX10) {
943
desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
944
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
945
S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
946
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
947
S_008F0C_RESOURCE_LEVEL(1);
948
} else {
949
desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
950
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
951
S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
952
S_008F0C_DATA_FORMAT(output_indexbuf_format);
953
}
954
955
/* Viewport state. */
956
struct si_small_prim_cull_info cull_info;
957
si_get_small_prim_cull_info(sctx, &cull_info);
958
959
desc[8] = fui(cull_info.scale[0]);
960
desc[9] = fui(cull_info.scale[1]);
961
desc[10] = fui(cull_info.translate[0]);
962
desc[11] = fui(cull_info.translate[1]);
963
964
/* Set user data SGPRs. */
965
/* This can't be >= 16 if we want the fastest launch rate. */
966
unsigned user_sgprs = info->instance_count > 1 ? 12 : 10;
967
968
uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
969
unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
970
unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
971
uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
972
uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
973
uint64_t vb_desc_va = sctx->vb_descriptors_buffer
974
? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
975
: 0;
976
si_resource_reference(&indexbuf_desc, NULL);
977
978
/* Set the compute shader. */
979
struct si_shader *shader = sctx->cs_prim_discard_state.current;
980
981
if (shader != sctx->compute_ib_last_shader) {
982
radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
983
RADEON_PRIO_SHADER_BINARY);
984
uint64_t shader_va = shader->bo->gpu_address;
985
986
assert(shader->config.scratch_bytes_per_wave == 0);
987
assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
988
989
radeon_begin(cs);
990
radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
991
radeon_emit(cs, shader_va >> 8);
992
radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
993
994
radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
995
radeon_emit(
996
cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
997
S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
998
S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
999
S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
1000
S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
1001
radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
1002
S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
1003
S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
1004
S_00B84C_LDS_SIZE(shader->config.lds_size));
1005
1006
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
1007
ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
1008
MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
1009
radeon_end();
1010
sctx->compute_ib_last_shader = shader;
1011
}
1012
1013
STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);
1014
1015
struct si_fast_udiv_info32 num_prims_udiv = {};
1016
1017
for (unsigned i = 0; i < num_draws; i++) {
1018
unsigned count = draws[i].count;
1019
unsigned num_prims_per_instance, num_prims;
1020
1021
/* Determine the number of primitives per instance. */
1022
if (info->mode == PIPE_PRIM_TRIANGLES)
1023
num_prims_per_instance = count / 3;
1024
else if (count >= 2)
1025
num_prims_per_instance = count - 2;
1026
else
1027
num_prims_per_instance = 0;
1028
1029
if (!num_prims_per_instance)
1030
continue;
1031
1032
num_prims = num_prims_per_instance;
1033
1034
if (info->instance_count > 1) {
1035
num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
1036
num_prims *= info->instance_count;
1037
}
1038
1039
/* Limitations on how these two are packed in the user SGPR. */
1040
assert(num_prims_udiv.post_shift < 32);
1041
assert(num_prims_per_instance < 1 << 27);
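      /* The packed SGPR emitted below must match the shader-side unpacking in
       * si_build_prim_discard_compute_shader(): post_shift in bits [0:4] and
       * primitives per instance in bits [5:31] of NUM_PRIMS_UDIV_TERMS.
       */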
1042
1043
/* Big draw calls are split into smaller dispatches and draw packets. */
1044
for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
1045
unsigned num_subdraw_prims;
1046
1047
if (start_prim + PRIMS_PER_BATCH < num_prims) {
1048
num_subdraw_prims = PRIMS_PER_BATCH;
1049
} else {
1050
num_subdraw_prims = num_prims - start_prim;
1051
}
1052
1053
/* Small dispatches are executed back to back until a specific primitive
1054
* count is reached. Then, a CS_DONE is inserted to signal the gfx IB
1055
* to start drawing the batch. This batching adds latency to the gfx IB,
1056
* but CS_DONE and REWIND are too slow.
1057
*/
1058
if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
1059
si_compute_signal_gfx(sctx);
1060
1061
if (sctx->compute_num_prims_in_batch == 0) {
1062
assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
1063
sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
1064
1065
radeon_begin(gfx_cs);
1066
radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
1067
radeon_emit(gfx_cs, 0);
1068
radeon_end();
1069
}
1070
1071
sctx->compute_num_prims_in_batch += num_subdraw_prims;
1072
1073
uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
1074
uint64_t index_va = out_indexbuf_va + start_prim * 12;
1075
1076
/* Emit the draw packet into the gfx IB. */
1077
radeon_begin(gfx_cs);
1078
radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
1079
radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim);
1080
radeon_emit(gfx_cs, index_va);
1081
radeon_emit(gfx_cs, index_va >> 32);
1082
radeon_emit(gfx_cs, 0);
1083
radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
1084
radeon_end();
1085
1086
radeon_begin_again(cs);
1087
1088
/* Continue with the compute IB. */
1089
if (start_prim == 0) {
1090
if (i == 0) {
1091
/* First draw. */
1092
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
1093
radeon_emit(cs, count_va);
1094
radeon_emit(cs, start_prim);
1095
radeon_emit(cs, draws[i].start);
1096
radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
1097
radeon_emit(cs, index_buffers_va);
1098
radeon_emit(cs, vb_desc_va);
1099
radeon_emit(cs, vs_const_desc_va);
1100
radeon_emit(cs, vs_sampler_desc_va);
1101
radeon_emit(cs, info->start_instance);
1102
/* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
1103
radeon_emit(cs, fui(cull_info.small_prim_precision));
1104
1105
if (info->instance_count > 1) {
1106
radeon_emit(cs, num_prims_udiv.multiplier);
1107
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
1108
}
1109
} else {
1110
/* Subsequent draws. */
1111
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4);
1112
radeon_emit(cs, count_va);
1113
radeon_emit(cs, 0);
1114
radeon_emit(cs, draws[i].start);
1115
radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
1116
1117
if (info->instance_count > 1) {
1118
radeon_set_sh_reg_seq(cs, R_00B928_COMPUTE_USER_DATA_10, 2);
1119
radeon_emit(cs, num_prims_udiv.multiplier);
1120
radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
1121
}
1122
}
1123
} else {
1124
/* Draw split. Only update the SGPRs that changed. */
1125
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
1126
radeon_emit(cs, count_va);
1127
radeon_emit(cs, start_prim);
1128
}
1129
1130
/* Set grid dimensions. */
1131
unsigned start_block = start_prim / THREADGROUP_SIZE;
1132
unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
1133
unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
1134
1135
radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
1136
radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
1137
S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
1138
S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
1139
1140
radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
1141
radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
1142
radeon_emit(cs, 1);
1143
radeon_emit(cs, 1);
1144
radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
1145
S_00B800_ORDER_MODE(0 /* launch in order */));
1146
radeon_end();
1147
1148
assert(cs->current.cdw <= cs->current.max_dw);
1149
assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
1150
}
1151
}
1152
}
1153
1154