Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "ac_llvm_cull.h"
#include "si_build_pm4.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"

/* Based on:
 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
 */

/* This file implements primitive culling using asynchronous compute.
 *
 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
 * in a compute shader. The shader processes 1 primitive/thread by invoking
 * the VS for each vertex to get the positions, decomposes strips
 * into triangles (if needed), eliminates primitive restart (if needed),
 * does (W<0) culling, face culling, view XY culling, zero-area and
 * small-primitive culling, and generates a new index buffer that doesn't
 * contain culled primitives.
 *
 * There is no primitive ordering. The generated index buffer will contain
 * primitives in a random order.
 *
 * IB = a GPU command buffer
 *
 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
 * doesn't continue if its word isn't 0x80000000. The vertex count is being
 * atomically incremented within the draw packet. A CS_DONE event will signal
 * the REWIND packet to continue. It's really a direct draw with command
 * buffer patching from the compute queue.
 *
 * The compute IB doesn't have to start when its corresponding gfx IB starts,
 * but can start sooner. The compute IB is signaled to start after the last
 * execution barrier in the *previous* gfx IB. This is handled as follows.
 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
 * represents the barrier in the previous gfx IB.
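 *
 * A sketch of the handshake (see si_compute_signal_gfx and the packet
 * emission in si_dispatch_prim_discard_cs_and_draw below; this is an
 * illustration, not a packet spec):
 *
 *   gfx IB:     REWIND               ; CP stalls until the marker dword
 *                                    ; becomes REWIND_SIGNAL_BIT
 *               DRAW_INDEX_2         ; its index count starts at 0
 *   compute IB: DISPATCH_DIRECT      ; culls prims; atomically adds
 *                                    ; 3 * accepted prims to the draw's count
 *               RELEASE_MEM(CS_DONE) ; writes REWIND_SIGNAL_BIT to the
 *                                    ; REWIND marker, releasing the gfx IB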
 *
 * Features:
 * - Triangle strips are decomposed into an indexed triangle list.
 *   The decomposition differs based on the provoking vertex state.
 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
 *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
 * - Back face culling, incl. culling zero-area / degenerate primitives.
 * - View XY culling.
 * - Small primitive culling for all MSAA modes and all quant modes.
 *
 * The following are not implemented:
 * - ClipVertex/ClipDistance/CullDistance-based culling.
 * - Scissor culling.
 * - HiZ culling.
 *
 * Limitations (and unimplemented features that may be possible to implement):
 * - Only triangles and triangle strips are supported.
 * - Primitive restart is not supported.
 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
 * - The instance divisor buffer is unavailable, so all divisors must be
 *   either 0 or 1.
 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
 * - No support for tessellation and geometry shaders.
 *   (patch elimination where tess factors are 0 would be possible to implement)
 * - The vertex shader must not contain memory stores.
 * - All VS resources must not have a write usage in the command buffer.
 * - Bindless textures and images must not occur in the vertex shader.
 *
 * User data SGPR layout:
 *   VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader.
 *   START_OUT_INDEX: output index buffer offset / 12
 *   START_IN_INDEX: input index buffer offset / index_size
 *   VS.BASE_VERTEX: same value as VS
 *   INDEX_BUFFERS: pointer to constants
 *     0..3: input index buffer - typed buffer view
 *     4..7: output index buffer - typed buffer view
 *     8..11: viewport state - scale.xy, translate.xy
 *   VS.VERTEX_BUFFERS: same value as VS
 *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
 *   VS.SAMPLERS_AND_IMAGES: same value as VS
 *   VS.START_INSTANCE: same value as VS
 *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
 *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
 *     per instance for instancing.
 *   NUM_PRIMS_UDIV_TERMS:
 *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
 *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
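 *
 * For illustration, the packing of NUM_PRIMS_UDIV_TERMS done when emitting
 * user SGPRs below, and the matching unpack in the compute shader, are:
 *
 *   terms = post_shift | (prims_per_instance << 5);   // CPU, radeon_emit
 *   post_shift = terms & 0x1f;                        // shader, bits [0:4]
 *   prims_per_instance = terms >> 5;                  // shader, bits [5:31]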
 *
 * How to test primitive restart (the most complicated part because it needs
 * to get the primitive orientation right):
 *   Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
 *   primitive orientation flips with small draw calls, which is what most tests use.
 *   You can also enable draw call splitting into draw calls with just 2 primitives.
 */

/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE     256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU  1   /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH     0   /* no limit */
#define INDEX_STORES_USE_SLC 1   /* don't cache indices if L2 is full */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance.
 * This also determines the granularity of draw-level and packet-level splitting.
 */
#define PRIMS_PER_IB    (1024 * 1024) /* size per gfx IB */
#define PRIMS_PER_BATCH (128 * 1024)  /* size between REWIND packets */

/* Derived values. */
#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)

#define REWIND_SIGNAL_BIT 0x80000000

static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr);

void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
                                         unsigned *prim_discard_vertex_count_threshold,
                                         unsigned *index_ring_size_per_ib)
{
   *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */

   if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */
       sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
      return;

   /* TODO: enable this */
   bool enable_by_default = false;

   if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
       (enable_by_default && sscreen->allow_draw_out_of_order &&
        sscreen->info.num_se >= 2)) {
      *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */

      if (sscreen->debug_flags & DBG(ALWAYS_PD))
         *prim_discard_vertex_count_threshold = 0; /* always enable */

      /* The total size is double this per context. Greater numbers allow bigger gfx IBs. */
      *index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. */
   }
}

static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
   uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
   ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
   ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
   return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
}

struct si_thread0_section {
   struct si_shader_context *ctx;
   LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
   LLVMValueRef saved_exec;
};

/* Enter a section that only executes on thread 0. */
static void si_enter_thread0_section(struct si_shader_context *ctx,
                                     struct si_thread0_section *section, LLVMValueRef thread_id,
                                     LLVMValueRef check_nonzero)
{
   section->ctx = ctx;
   section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");

   /* This IF has 4 instructions:
    *   v_and_b32_e32 v, 63, v         ; get the thread ID
    *   v_cmp_eq_u32_e32 vcc, 0, v     ; thread ID == 0
    *   s_and_saveexec_b64 s, vcc
    *   s_cbranch_execz BB0_4
    *
    * It could just be s_and_saveexec_b64 s, 1.
    */
   LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, "");
   if (check_nonzero) {
      cond = LLVMBuildAnd(ctx->ac.builder, cond,
                          LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, check_nonzero,
                                        ctx->ac.i32_0, ""), "");
   }
   ac_build_ifcc(&ctx->ac, cond, 12601);
}

/* Exit a section that only executes on thread 0 and broadcast the result
 * to all threads. */
static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
{
   struct si_shader_context *ctx = section->ctx;

   LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);

   ac_build_endif(&ctx->ac, 12601);

   /* Broadcast the result from thread 0 to all threads. */
   *result =
      ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
}
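/* Usage sketch for the two helpers above (mirrors how
 * si_build_primitive_accepted uses them further down; "result" is just an
 * illustrative name):
 *
 *    struct si_thread0_section section;
 *    LLVMValueRef result;
 *
 *    si_enter_thread0_section(ctx, &section, thread_id, NULL);
 *    {
 *       result = ...; // e.g. a single atomic add executed by thread 0 only
 *    }
 *    si_exit_thread0_section(&section, &result); // result valid in all lanes
 */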
static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                        void *data);

void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
   struct si_shader_key *key = &ctx->shader->key;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef vs = ctx->main_fn;

   /* Always inline the VS function. */
   ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
   LLVMSetLinkage(vs, LLVMPrivateLinkage);

   enum ac_arg_type const_desc_type;
   if (ctx->shader->selector->info.base.num_ubos == 1 &&
       ctx->shader->selector->info.base.num_ssbos == 0)
      const_desc_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_desc_type = AC_ARG_CONST_DESC_PTR;

   memset(&ctx->args, 0, sizeof(ctx->args));

   struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
   struct ac_arg param_vb_desc, param_const_desc, param_start_out_index;
   struct ac_arg param_base_vertex, param_start_instance, param_start_in_index;
   struct ac_arg param_block_id, param_local_id, param_smallprim_precision;
   struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
   struct ac_arg param_sampler_desc;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_out_index);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_in_index);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_index_buffers_and_constants);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
   if (key->opt.cs_instancing) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
   }

   /* Block ID and thread ID inputs. */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);

   /* Create the compute shader function. */
   gl_shader_stage old_stage = ctx->stage;
   ctx->stage = MESA_SHADER_COMPUTE;
   si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
   ctx->stage = old_stage;

   /* Assemble parameters for VS. */
   LLVMValueRef vs_params[16];
   unsigned num_vs_params = 0;
   unsigned param_vertex_id, param_instance_id;

   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */
   vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
   vs_params[num_vs_params++] =
      LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
   vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);

   vs_params[(param_vertex_id = num_vs_params++)] = NULL;   /* VertexID */
   vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
   vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */

   assert(num_vs_params <= ARRAY_SIZE(vs_params));
   assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
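   /* For reference, the SGPR argument order above matches the user data
    * emitted with radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0) in
    * si_dispatch_prim_discard_cs_and_draw:
    *
    *   s0: VERTEX_COUNTER            s6:  VS.CONST_AND_SHADER_BUFFERS
    *   s1: START_OUT_INDEX           s7:  VS.SAMPLERS_AND_IMAGES
    *   s2: START_IN_INDEX            s8:  VS.START_INSTANCE
    *   s3: VS.BASE_VERTEX            s9:  SMALL_PRIM_CULLING_PRECISION
    *   s4: INDEX_BUFFERS             s10: NUM_PRIMS_UDIV_MULTIPLIER
    *   s5: VS.VERTEX_BUFFERS         s11: NUM_PRIMS_UDIV_TERMS
    *
    * (s10-s11 exist only with instancing, hence user_sgprs = 12 vs 10 below,
    * and they alone can be updated via R_00B928_COMPUTE_USER_DATA_10.)
    */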
   /* Load descriptors. (load 8 dwords at once) */
   LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];

   LLVMValueRef index_buffers_and_constants =
      ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
   tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
                              ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
   tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);

   for (unsigned i = 0; i < 8; i++)
      desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);

   input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
   output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);

   /* Compute PrimID and InstanceID. */
   LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
                                                 LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
                                                 ac_get_arg(&ctx->ac, param_local_id));
   LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
   LLVMValueRef instance_id = ctx->ac.i32_0;

   if (key->opt.cs_instancing) {
      LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
      LLVMValueRef num_prims_udiv_multiplier =
         ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
      /* Unpack num_prims_udiv_terms. */
      LLVMValueRef post_shift =
         LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
      LLVMValueRef prims_per_instance =
         LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
      /* Divide the total prim_id by the number of prims per instance. */
      instance_id =
         ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
      /* Compute the remainder. */
      prim_id = LLVMBuildSub(builder, prim_id,
                             LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
   }

   /* Generate indices (like a non-indexed draw call). */
   LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
   unsigned vertices_per_prim = 3;

   switch (key->opt.cs_prim_type) {
   case PIPE_PRIM_TRIANGLES:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
                                  LLVMConstInt(ctx->ac.i32, i, 0));
      }
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      for (unsigned i = 0; i < 3; i++) {
         index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
      }
      break;
   default:
      unreachable("unexpected primitive type");
   }
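   /* E.g. for prim_id = 5, the generated indices are 15, 16, 17 for
    * PIPE_PRIM_TRIANGLES (5*3 + 0..2) and 5, 6, 7 for PIPE_PRIM_TRIANGLE_STRIP
    * (5 + 0..2; odd strip primitives get their orientation fixed up below).
    */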
   /* Fetch indices. */
   if (key->opt.cs_indexed) {
      for (unsigned i = 0; i < 3; i++) {
         index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), "");
         index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
                                                1, 0, true, false, false);
         index[i] = ac_to_integer(&ctx->ac, index[i]);
      }
   }

   LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
                                         LLVMConstInt(ctx->ac.i32, 63, 0), "");

   /* Every other triangle in a strip has a reversed vertex order, so we
    * need to swap vertices of odd primitives to get the correct primitive
    * orientation when converting triangle strips to triangles. Primitive
    * restart complicates it, because a strip can start anywhere.
    */
   LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
   LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);

   if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
      /* Without primitive restart, odd primitives have reversed orientation.
       * Only primitive restart can flip it with respect to the first vertex
       * of the draw call.
       */
      /* prim_is_odd = current_is_odd % 2. */
      LLVMValueRef prim_is_odd = LLVMBuildXor(
         builder, ctx->ac.i1false, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");

      /* Convert triangle strip indices to triangle indices. */
      ac_build_triangle_strip_indices_to_triangle(
         &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
         index);
   }

   /* Execute the vertex shader for each vertex to get vertex positions. */
   LLVMValueRef pos[3][4];
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      vs_params[param_vertex_id] = index[i];
      vs_params[param_instance_id] = instance_id;

      LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
      for (unsigned chan = 0; chan < 4; chan++)
         pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
   }

   /* Divide XYZ by W. */
   for (unsigned i = 0; i < vertices_per_prim; i++) {
      for (unsigned chan = 0; chan < 3; chan++)
         pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
   }

   /* Load the viewport state. */
   LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
                                             LLVMConstInt(ctx->ac.i32, 2, 0));
   vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
   LLVMValueRef vp_scale[2], vp_translate[2];
   vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
   vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
   vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
   vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

   /* Do culling. */
   struct ac_cull_options options = {};
   options.cull_front = key->opt.cs_cull_front;
   options.cull_back = key->opt.cs_cull_back;
   options.cull_view_xy = true;
   options.cull_small_prims = true;
   options.cull_zero_area = true;
   options.cull_w = true;

   LLVMValueRef params[] = {
      instance_id,
      vertex_counter,
      output_indexbuf,
      (void*)index,
      ac_get_arg(&ctx->ac, param_start_out_index),
   };

   ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
                    ac_get_arg(&ctx->ac, param_smallprim_precision), &options,
                    si_build_primitive_accepted, params);
   LLVMBuildRetVoid(builder);
}

static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                        void *userdata)
{
   struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
   struct si_shader_key *key = &ctx->shader->key;
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned vertices_per_prim = 3;
   LLVMValueRef *params = (LLVMValueRef *)userdata;
   LLVMValueRef instance_id = params[0];
   LLVMValueRef vertex_counter = params[1];
   LLVMValueRef output_indexbuf = params[2];
   LLVMValueRef *index = (LLVMValueRef *)params[3];
   LLVMValueRef start_out_index = params[4];

   LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

   ac_build_ifcc(&ctx->ac, accepted, 16607);

   /* Count the number of active threads by doing bitcount(accepted). */
   LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask);
   num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");

   /* Get the number of bits set before the index of this thread. */
   LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
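   /* Stream-compaction sketch: if the wave's accepted mask is e.g. 0b1011,
    * num_prims_accepted = 3 and mbcnt gives each accepted thread the number
    * of accepted threads below it (thread 0 -> 0, thread 1 -> 1, thread 3 -> 2),
    * so accepted primitives land in consecutive output slots.
    */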
   LLVMValueRef start;

   /* Execute atomic_add on the vertex count. */
   struct si_thread0_section section;
   si_enter_thread0_section(ctx, &section, prim_index, num_prims_accepted);
   {
      LLVMValueRef num_indices = LLVMBuildMul(
         builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
      vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
      start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
                                 LLVMAtomicOrderingMonotonic, false);
   }
   si_exit_thread0_section(&section, &start);

   /* Convert it into the primitive index. */
   start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");

   /* Now we need to store the indices of accepted primitives into
    * the output index buffer.
    */

   /* We have lowered instancing. Pack the instance ID into vertex ID. */
   if (key->opt.cs_instancing) {
      instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");

      for (unsigned i = 0; i < vertices_per_prim; i++)
         index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
   }
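   /* E.g. vertex 7 of instance 3 is written out as (3 << 16) | 7; the real VS
    * bound to the gfx draw unpacks it again (hence the 16-bit index and
    * 2^16 instance limits in the header comment).
    */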
   /* Write indices for accepted primitives. */
   LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
   vindex = LLVMBuildAdd(builder, vindex, start_out_index, "");
   LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);

   if (!ac_has_vec3_support(ctx->ac.chip_class, true))
      vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);

   ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
                                ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
   ac_build_endif(&ctx->ac, 16607);
}

/* Return false if the shader isn't ready. */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
                                             const struct pipe_draw_info *info)
{
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   struct si_shader_key key;

   memset(&key, 0, sizeof(key));
   si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog);
   assert(!key.part.vs.prolog.instance_divisor_is_fetched);

   key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
   key.opt.vs_as_prim_discard_cs = 1;
   key.opt.cs_prim_type = info->mode;
   key.opt.cs_indexed = info->index_size != 0;
   key.opt.cs_instancing = info->instance_count > 1;
   key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;

   if (rs->rasterizer_discard) {
      /* Just for performance testing and analysis of trivial bottlenecks.
       * This should result in a very short compute shader. */
      key.opt.cs_cull_front = 1;
      key.opt.cs_cull_back = 1;
   } else {
      key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front;
      key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back;
   }

   sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso;
   sctx->cs_prim_discard_state.current = NULL;

   if (!sctx->compiler.passes)
      si_init_compiler(sctx->screen, &sctx->compiler);

   struct si_compiler_ctx_state compiler_state;
   compiler_state.compiler = &sctx->compiler;
   compiler_state.debug = sctx->debug;
   compiler_state.is_debug_context = sctx->is_debug;

   return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
                                    &key, -1, true) == 0 &&
          /* Disallow compute shaders using the scratch buffer. */
          sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}

static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
   if (sctx->index_ring)
      return true;

   if (!sctx->prim_discard_compute_cs.priv) {
      struct radeon_winsys *ws = sctx->ws;

      if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs,
                                          &sctx->gfx_cs, false))
         return false;
   }

   if (!sctx->index_ring) {
      sctx->index_ring = si_aligned_buffer_create(
         sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
         PIPE_USAGE_DEFAULT,
         sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
      if (!sctx->index_ring)
         return false;
   }
   return true;
}

static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
   return sctx->index_ring_offset +
             align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
          sctx->index_ring_size_per_ib;
}

#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)
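/* COMPUTE_PREAMBLE_SIZE adds up the dword counts annotated in
 * si_dispatch_prim_discard_cs_and_draw below:
 *   8 (ACQUIRE_MEM / surface sync) + 39 (si_emit_initial_compute_regs) +
 *   11 (SH register setup) + 7 (si_cp_wait_mem mid-IB fence wait)
 */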
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
                                      unsigned drawid_offset,
                                      const struct pipe_draw_start_count_bias *draws,
                                      unsigned num_draws, unsigned total_count)
{
   /* If the compute shader compilation isn't finished, this returns false. */
   if (!si_shader_select_prim_discard_cs(sctx, info))
      return SI_PRIM_DISCARD_DISABLED;

   if (!si_initialize_prim_discard_cmdbuf(sctx))
      return SI_PRIM_DISCARD_DISABLED;

   struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
   unsigned prim = info->mode;
   unsigned instance_count = info->instance_count;

   unsigned num_prims_per_instance;
   if (prim == PIPE_PRIM_TRIANGLES)
      num_prims_per_instance = total_count / 3;
   else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
      num_prims_per_instance = total_count - 2; /* approximation ignoring multi draws */
   else
      unreachable("shouldn't get here");

   unsigned num_prims = num_prims_per_instance * instance_count;
   unsigned out_indexbuf_size = num_prims * 12;
   bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);

   /* Split draws at the draw call level if the ring is full. This makes
    * better use of the ring space.
    *
    * If instancing is enabled and there is not enough ring buffer space, compute-based
    * primitive discard is disabled.
    */
   if (ring_full && num_prims > PRIMS_PER_BATCH && instance_count == 1) {
      unsigned vert_count_per_subdraw = 0;

      if (prim == PIPE_PRIM_TRIANGLES)
         vert_count_per_subdraw = PRIMS_PER_BATCH * 3;
      else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
         vert_count_per_subdraw = PRIMS_PER_BATCH;

      /* Split multi draws first. */
      if (num_draws > 1) {
         unsigned count = 0;
         unsigned first_draw = 0;
         unsigned num_draws_split = 0;

         for (unsigned i = 0; i < num_draws; i++) {
            if (count && count + draws[i].count > vert_count_per_subdraw) {
               /* Submit previous draws. */
               sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
               count = 0;
               first_draw = i;
               num_draws_split = 0;
            }

            if (draws[i].count > vert_count_per_subdraw) {
               /* Submit just 1 draw. It will be split. */
               sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + i, 1);
               assert(count == 0);
               assert(first_draw == i);
               assert(num_draws_split == 0);
               first_draw = i + 1;
               continue;
            }

            count += draws[i].count;
            num_draws_split++;
         }

         if (count) {
            /* Submit the remaining draws. */
            assert(num_draws_split > 0);
            sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split);
         }
         return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT;
      }

      /* Split single draws if splitting multi draws isn't enough. */
      struct pipe_draw_info split_draw = *info;
      struct pipe_draw_start_count_bias split_draw_range = draws[0];
      unsigned base_start = split_draw_range.start;
      unsigned count = draws[0].count;

      if (prim == PIPE_PRIM_TRIANGLES) {
         assert(vert_count_per_subdraw < count);

         for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
            split_draw_range.start = base_start + start;
            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);

            sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
         }
      } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
         /* No primitive pair can be split, because strips reverse orientation
          * for odd primitives. */
         STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0);

         for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
            split_draw_range.start = base_start + start;
            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);

            sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1);
         }
      }

      return SI_PRIM_DISCARD_DRAW_SPLIT;
   }
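   /* An illustrative (scaled-down) example of the strip split above: with
    * count = 10 and vert_count_per_subdraw = 4, the subdraws cover vertices
    * [0..5] and [4..9]. Each subdraw past the first re-reads 2 vertices, so
    * all count - 2 = 8 strip triangles are drawn exactly once.
    */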
   /* Just quit if the draw call doesn't fit into the ring and can't be split. */
   if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
      if (SI_PRIM_DISCARD_DEBUG)
         puts("PD failed: draw call too big, can't be split");
      return SI_PRIM_DISCARD_DISABLED;
   }

   /* Compute how many CS dwords we need to reserve. */
   unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE +
                              11 /* shader */ +
                              30; /* leave some space at the end */
   unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);

   for (unsigned i = 0; i < num_draws; i++) {
      unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH);

      need_compute_dw += 8 * num_subdraws +       /* signal REWIND */
                         14 /* user SGPRs */ +
                         4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */
                         11 * num_subdraws;
      need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
   }

   if (ring_full ||
       !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
      /* If the current IB is empty but the size is too small, add a NOP
       * packet to force a flush and get a bigger IB.
       */
      if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
          gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
         radeon_begin(gfx_cs);
         radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
         radeon_emit(gfx_cs, 0);
         radeon_end();
      }

      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
   }

   /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
   assert(compute_has_space);
   assert(si_check_ring_space(sctx, out_indexbuf_size));
   assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw);
   return SI_PRIM_DISCARD_ENABLED;
}

void si_compute_signal_gfx(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   unsigned writeback_L2_flags = 0;

   /* GFX8 needs to flush L2 for CP to see the updated vertex count. */
   if (sctx->chip_class == GFX8)
      writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;

   if (!sctx->compute_num_prims_in_batch)
      return;

   assert(sctx->compute_rewind_va);

   /* After the queued dispatches are done and vertex counts are written to
    * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
    * the dispatches to finish, it only adds the CS_DONE event into the event
    * queue.
    */
   si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
                     sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
                     writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, NULL,
                     sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
                     REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
                     SI_NOT_QUERY);

   sctx->compute_rewind_va = 0;
   sctx->compute_num_prims_in_batch = 0;
}

/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          const struct pipe_draw_start_count_bias *draws,
                                          unsigned num_draws, unsigned index_size,
                                          unsigned total_count, uint64_t input_indexbuf_va,
                                          unsigned index_max_size)
{
   struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
   struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
   unsigned num_total_prims;
   unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;

   if (!info->instance_count)
      return;

   switch (info->mode) {
   case PIPE_PRIM_TRIANGLES:
   case PIPE_PRIM_TRIANGLE_STRIP:
      if (info->mode == PIPE_PRIM_TRIANGLES)
         num_total_prims = total_count / 3;
      else if (total_count >= 2)
         num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */
      else
         num_total_prims = 0;

      vertices_per_prim = 3;
      output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
      gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
      break;
   default:
      unreachable("unsupported primitive type");
      return;
   }

   if (!num_total_prims)
      return;

   num_total_prims *= info->instance_count;

   unsigned out_indexbuf_offset;
   uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4;

   /* Initialize the compute IB if it's empty. */
   if (!sctx->prim_discard_compute_ib_initialized) {
      /* 1) State initialization. */
      sctx->compute_ib_last_shader = NULL;

      if (sctx->last_ib_barrier_fence) {
         assert(!sctx->last_ib_barrier_buf);
         sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
                                           RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
      }
      /* 2) IB initialization. */

      /* This needs to be done at the beginning of IBs due to possible
       * TTM buffer moves in the kernel.
       */
      if (sctx->chip_class >= GFX10) { /* 8 DW */
         radeon_begin(cs);
         radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
         radeon_emit(cs, 0);          /* CP_COHER_CNTL */
         radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
         radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
         radeon_emit(cs, 0);          /* CP_COHER_BASE */
         radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
         radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
         radeon_emit(cs,              /* GCR_CNTL */
                     S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
                        S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
                        S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
         radeon_end();
      } else {
         si_emit_surface_sync(sctx, cs,
                              S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                                 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
                                 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
                                 S_0085F0_SH_KCACHE_ACTION_ENA(1));
      }

      si_emit_initial_compute_regs(sctx, cs); /* 39 DW */

      radeon_begin(cs); /* 11 DW */
      radeon_set_sh_reg(
         cs, R_00B860_COMPUTE_TMPRING_SIZE,
         S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */

      /* Only 1D grids are launched. */
      radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
      radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
      radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));

      radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);
      radeon_end();

      if (sctx->last_ib_barrier_buf) {
         assert(!sctx->last_ib_barrier_fence);
         radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
                                   RADEON_PRIO_FENCE);
         si_cp_wait_mem(sctx, cs, /* 7 DW */
                        sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
                        1, 1, WAIT_REG_MEM_EQUAL);
      }

      sctx->prim_discard_compute_ib_initialized = true;
      assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE);
   }

   /* Allocate the output index buffer. */
   output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
   assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
   out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
   sctx->index_ring_offset += output_indexbuf_size;

   radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
                             RADEON_PRIO_SHADER_RW_BUFFER);
   uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;

   /* Prepare index buffer descriptors. */
   struct si_resource *indexbuf_desc = NULL;
   unsigned indexbuf_desc_offset;
   unsigned desc_size = 12 * 4;
   uint32_t *desc;

   u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
                  &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
   radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
                             RADEON_PRIO_DESCRIPTORS);
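   /* The 12-dword upload holds three packed views, matching the INDEX_BUFFERS
    * layout in the header comment:
    *   desc[0..3]  - input index buffer  (typed buffer view)
    *   desc[4..7]  - output index buffer (typed buffer view)
    *   desc[8..11] - viewport state: scale.xy, translate.xy
    */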
   /* Input index buffer. */
   desc[0] = input_indexbuf_va;
   desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
   desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1);

   if (sctx->chip_class >= GFX10) {
      desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                S_008F0C_FORMAT(index_size == 1 ? V_008F0C_GFX10_FORMAT_8_UINT
                                : index_size == 2 ? V_008F0C_GFX10_FORMAT_16_UINT
                                                  : V_008F0C_GFX10_FORMAT_32_UINT) |
                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
                S_008F0C_RESOURCE_LEVEL(1);
   } else {
      desc[3] =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
         S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
                              : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
                                                : V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* Output index buffer. */
   desc[4] = out_indexbuf_va;
   desc[5] =
      S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
   desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);

   if (sctx->chip_class >= GFX10) {
      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
                S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
                S_008F0C_RESOURCE_LEVEL(1);
   } else {
      desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
                S_008F0C_DATA_FORMAT(output_indexbuf_format);
   }

   /* Viewport state. */
   struct si_small_prim_cull_info cull_info;
   si_get_small_prim_cull_info(sctx, &cull_info);

   desc[8] = fui(cull_info.scale[0]);
   desc[9] = fui(cull_info.scale[1]);
   desc[10] = fui(cull_info.translate[0]);
   desc[11] = fui(cull_info.translate[1]);

   /* Set user data SGPRs. */
   /* This can't be >= 16 if we want the fastest launch rate. */
   unsigned user_sgprs = info->instance_count > 1 ? 12 : 10;

   uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
   unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
   unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
   uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
   uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
   uint64_t vb_desc_va = sctx->vb_descriptors_buffer
                            ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
                            : 0;
   si_resource_reference(&indexbuf_desc, NULL);

   /* Set the compute shader. */
   struct si_shader *shader = sctx->cs_prim_discard_state.current;

   if (shader != sctx->compute_ib_last_shader) {
      radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
                                RADEON_PRIO_SHADER_BINARY);
      uint64_t shader_va = shader->bo->gpu_address;

      assert(shader->config.scratch_bytes_per_wave == 0);
      assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
      radeon_begin(cs);
      radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
      radeon_emit(cs, shader_va >> 8);
      radeon_emit(cs, S_00B834_DATA(shader_va >> 40));

      radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
      radeon_emit(
         cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
                S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
                S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
                S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
                S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
      radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
                      S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
                      S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
                      S_00B84C_LDS_SIZE(shader->config.lds_size));

      radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                        ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
                                                       MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
      radeon_end();
      sctx->compute_ib_last_shader = shader;
   }

   STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);

   struct si_fast_udiv_info32 num_prims_udiv = {};

   for (unsigned i = 0; i < num_draws; i++) {
      unsigned count = draws[i].count;
      unsigned num_prims_per_instance, num_prims;

      /* Determine the number of primitives per instance. */
      if (info->mode == PIPE_PRIM_TRIANGLES)
         num_prims_per_instance = count / 3;
      else if (count >= 2)
         num_prims_per_instance = count - 2;
      else
         num_prims_per_instance = 0;

      if (!num_prims_per_instance)
         continue;

      num_prims = num_prims_per_instance;

      if (info->instance_count > 1) {
         num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
         num_prims *= info->instance_count;
      }

      /* Limitations on how these two are packed in the user SGPR. */
      assert(num_prims_udiv.post_shift < 32);
      assert(num_prims_per_instance < 1 << 27);

      /* Big draw calls are split into smaller dispatches and draw packets. */
      for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
         unsigned num_subdraw_prims;

         if (start_prim + PRIMS_PER_BATCH < num_prims) {
            num_subdraw_prims = PRIMS_PER_BATCH;
         } else {
            num_subdraw_prims = num_prims - start_prim;
         }

         /* Small dispatches are executed back to back until a specific primitive
          * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
          * to start drawing the batch. This batching adds latency to the gfx IB,
          * but CS_DONE and REWIND are too slow.
          */
         if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
            si_compute_signal_gfx(sctx);

         if (sctx->compute_num_prims_in_batch == 0) {
            assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
            sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;

            radeon_begin(gfx_cs);
            radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
            radeon_emit(gfx_cs, 0);
            radeon_end();
         }

         sctx->compute_num_prims_in_batch += num_subdraw_prims;

         uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
         uint64_t index_va = out_indexbuf_va + start_prim * 12;

         /* Emit the draw packet into the gfx IB. */
         radeon_begin(gfx_cs);
         radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
         radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim);
         radeon_emit(gfx_cs, index_va);
         radeon_emit(gfx_cs, index_va >> 32);
         radeon_emit(gfx_cs, 0);
         radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
         radeon_end();
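         /* Resulting gfx IB dwords at a batch start (a sketch derived from
          * the compute_rewind_va and count_va math above, not a packet spec):
          *   dw+0: PKT3(PKT3_REWIND)
          *   dw+1: 0  <- compute_rewind_va; CS_DONE writes REWIND_SIGNAL_BIT
          *   dw+2: PKT3(PKT3_DRAW_INDEX_2)
          *   dw+3: max index count
          *   dw+4: index_va low          dw+5: index_va high
          *   dw+6: 0  <- count_va; the CS atomically adds 3 per accepted prim
          *   dw+7: V_0287F0_DI_SRC_SEL_DMA
          */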
         radeon_begin_again(cs);

         /* Continue with the compute IB. */
         if (start_prim == 0) {
            if (i == 0) {
               /* First draw. */
               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
               radeon_emit(cs, count_va);
               radeon_emit(cs, start_prim);
               radeon_emit(cs, draws[i].start);
               radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
               radeon_emit(cs, index_buffers_va);
               radeon_emit(cs, vb_desc_va);
               radeon_emit(cs, vs_const_desc_va);
               radeon_emit(cs, vs_sampler_desc_va);
               radeon_emit(cs, info->start_instance);
               /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
               radeon_emit(cs, fui(cull_info.small_prim_precision));

               if (info->instance_count > 1) {
                  radeon_emit(cs, num_prims_udiv.multiplier);
                  radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
               }
            } else {
               /* Subsequent draws. */
               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4);
               radeon_emit(cs, count_va);
               radeon_emit(cs, 0);
               radeon_emit(cs, draws[i].start);
               radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);

               if (info->instance_count > 1) {
                  radeon_set_sh_reg_seq(cs, R_00B928_COMPUTE_USER_DATA_10, 2);
                  radeon_emit(cs, num_prims_udiv.multiplier);
                  radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
               }
            }
         } else {
            /* Draw split. Only update the SGPRs that changed. */
            radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
            radeon_emit(cs, count_va);
            radeon_emit(cs, start_prim);
         }

         /* Set grid dimensions. */
         unsigned start_block = start_prim / THREADGROUP_SIZE;
         unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
         unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

         radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
         radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
                           S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
                              S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));

         radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
         radeon_emit(cs, 1);
         radeon_emit(cs, 1);
         radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
                         S_00B800_ORDER_MODE(0 /* launch in order */));
         radeon_end();

         assert(cs->current.cdw <= cs->current.max_dw);
         assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
      }
   }
}