Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "ac_llvm_cull.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_prim.h"

static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
}

static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.merged_wave_info, 28, 4);
}

static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}

static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 12, 9);
}

static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 22, 9);
}

static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
{
   return si_unpack_param(ctx, ctx->args.gs_tg_info, 0, 12);
}

static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
{
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
                                LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
}

static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
{
   if (ctx->stage == MESA_SHADER_VERTEX) {
      LLVMValueRef tmp;
      tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
                          LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
      return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
   }
   return ctx->ac.i1false;
}

/**
 * Return the number of vertices as a constant in \p num_vertices,
 * and return a more precise value as LLVMValueRef from the function.
 */
static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   if (ctx->stage == MESA_SHADER_VERTEX) {
      if (info->base.vs.blit_sgprs_amd) {
         /* Blits always use axis-aligned rectangles with 3 vertices. */
         *num_vertices = 3;
         return LLVMConstInt(ctx->ac.i32, 3, 0);
      } else {
         /* We always build up all three indices for the prim export
          * independent of the primitive type. The additional garbage
          * data shouldn't hurt. This number doesn't matter with
          * NGG passthrough.
          */
         *num_vertices = 3;

         /* Extract OUTPRIM field. */
         LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
         return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);

      if (info->base.tess.point_mode)
         *num_vertices = 1;
      else if (info->base.tess.primitive_mode == GL_LINES)
         *num_vertices = 2;
      else
         *num_vertices = 3;

      return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
   }
}

bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   assert(shader->key.as_ngg && !shader->key.as_es);

   return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
}

void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
{
   /* Newer chips can use PRIMGEN_PASSTHRU_NO_MSG to skip gs_alloc_req for NGG passthrough. */
   if (gfx10_is_ngg_passthrough(ctx->shader) &&
       ctx->screen->info.family >= CHIP_DIMGREY_CAVEFISH)
      return;

   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
                                 ngg_get_prim_cnt(ctx));
}

void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
      ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
      {
         struct ac_ngg_prim prim = {};

         if (prim_passthrough)
            prim.passthrough = prim_passthrough;
         else
            prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);

         /* This is only used with NGG culling, which returns the NGG
          * passthrough prim export encoding.
          */
         if (ctx->shader->selector->info.writes_edgeflag) {
            unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
            LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);

            unsigned num_vertices;
            ngg_get_vertices_per_prim(ctx, &num_vertices);

            for (unsigned i = 0; i < num_vertices; i++) {
               unsigned shift = 9 + i * 10;
               LLVMValueRef edge;

               edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
               edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
               edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
               edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
            }
            prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
         }

         ac_build_export_prim(&ctx->ac, &prim);
      }
      ac_build_endif(&ctx->ac, 6001);
      return;
   }

   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
   {
      struct ac_ngg_prim prim = {};

      ngg_get_vertices_per_prim(ctx, &prim.num_vertices);

      prim.isnull = ctx->ac.i1false;
      prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);

      for (unsigned i = 0; i < prim.num_vertices; ++i) {
         prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);

         if (ctx->shader->selector->info.writes_edgeflag) {
            LLVMValueRef edge;

            edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
            edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
            prim.edgeflag[i] = edge;
         }
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 6001);
}

static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
                                   LLVMValueRef *wg_offset_dw, unsigned stream,
                                   LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset[4] = {};
   LLVMValueRef tmp;

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (!wg_offset_dw[buffer])
         continue;

      tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
                         "");
      tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
      offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
   }

   for (unsigned i = 0; i < so->num_outputs; ++i) {
      if (so->output[i].stream != stream)
         continue;

      unsigned reg = so->output[i].register_index;
      struct si_shader_output_values out;
      out.semantic = info->output_semantic[reg];

      for (unsigned comp = 0; comp < 4; comp++) {
         tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
         out.values[comp] = LLVMBuildLoad(builder, tmp, "");
         out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
      }

      si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
   }
}

struct ngg_streamout {
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};

/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * Writes number of emitted primitives to gs_ngg_scratch[4:8].
 *
 * Clobbers gs_ngg_scratch[8:].
 */
static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
   LLVMValueRef so_buffer[4] = {};
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {};
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
   int stream_for_buffer[4] = {-1, -1, -1, -1};
   unsigned bufmask_for_stream[4] = {};
   bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->output[i].output_buffer;
      unsigned stream = so->output[i].stream;
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->stride[buffer]);

      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
      prim_stride_dw_vgpr =
         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
                            LLVMConstInt(ctx->ac.i32, buffer, false));

      so_buffer[buffer] = ac_build_load_to_sgpr(
         &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
   }

   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
   ac_build_ifcc(&ctx->ac, tmp, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5210);
      {
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!info->num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->ac.i32_0,                             // ordering
            ctx->ac.i32_0,                             // scope
            ctx->ac.i1false,                           // isVolatile
            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
            ctx->ac.i1true,                            // wave release
            ctx->ac.i1true,                            // wave done
         };
         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
                                  ARRAY_SIZE(args), 0);

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
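      /* In other words (restating the code below): for each buffer in use,
       *    max_emit = (buffer_size_dw - streamout_offset_dw) / prim_stride_dw,
       * clamped to 0 when the current offset already lies past the end of
       * the buffer.
       */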
      LLVMValueRef max_emit[4] = {};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         LLVMValueRef bufsize_dw = LLVMBuildLShr(
            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");

         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));

         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");

         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
      }

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->ac.i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));

         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         emit_vgpr =
            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
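         /* Sketch of the fixup below: if this stream generated more primitives
          * than fit into one of its buffers (emit < generated), subtract the
          * unwritten part, (generated - emit) * prim_stride_dw, from that
          * buffer's GDS offset so the offset reflects what was actually written.
          */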
         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {};

   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch = ac_build_gep0(
            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         if (ctx->stage == MESA_SHADER_GEOMETRY) {
            /* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */
            primemit_scan[stream].maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);
         } else {
            primemit_scan[stream].maxwaves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size,
                                                          ctx->ac.wave_size);
         }
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {};

   {
      LLVMValueRef scratch_vgpr;

      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (info->num_stream_output_components[stream]) {
            nggso->emit[stream] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
                          nggso->emit[stream], "");
      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, tmp, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");

         for (unsigned i = 0; i < max_num_vertices; ++i) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
                                nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, tmp, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
                                   nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}

/* LDS layout of ES vertex data for NGG culling. */
enum
{
   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
    *         ES thread ID. After vertex compaction, compacted ES threads
    *         store the old thread ID here to copy input VGPRs from uncompacted
    *         ES threads.
    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
    * Byte 2: TES rel patch ID
    * Byte 3: Unused
    */
   lds_byte0_accept_flag = 0,
   lds_byte1_new_thread_id,
   lds_byte2_tes_rel_patch_id,
   lds_byte3_unused,

   lds_packed_data = 0, /* lds_byteN_... */
   lds_pos_cull_x_div_w,
   lds_pos_cull_y_div_w,
   lds_pos_cull_w,

   lds_pos_x = lds_packed_data + 1,
   lds_pos_y,
   lds_pos_z,
   lds_pos_w,
   /* If VS: */
   lds_vertex_id,
   lds_instance_id, /* optional */
   /* If TES: */
   lds_tes_u = lds_vertex_id,
   lds_tes_v = lds_instance_id,
   lds_tes_patch_id, /* optional */
};

static LLVMValueRef si_build_gep_i8_var(struct si_shader_context *ctx, LLVMValueRef ptr,
                                        LLVMValueRef index)
{
   LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);

   return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
                       1, "");
}

static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
                                    unsigned byte_index)
{
   assert(byte_index < 4);
   return si_build_gep_i8_var(ctx, ptr, LLVMConstInt(ctx->ac.i32, byte_index, 0));
}

static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
{
   unsigned lds_vertex_size = 0;

   /* The edgeflag is always stored in the last element that's also
    * used for padding to reduce LDS bank conflicts. */
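   /* For example: with 8 streamout outputs this reserves 4 * 8 + 1 = 33 dwords
    * per vertex; the "+ 1" is the padding/edgeflag dword mentioned above.
    */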
   if (shader->selector->so.num_outputs)
      lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
   if (shader->selector->info.writes_edgeflag)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   /* LDS size for passing data from GS to ES.
    * GS stores Primitive IDs into LDS at the address corresponding
    * to the ES thread of the provoking vertex. All ES threads
    * load and export PrimitiveID for their thread.
    */
   if (shader->selector->info.stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
      lds_vertex_size = MAX2(lds_vertex_size, 1);

   if (shader->key.opt.ngg_culling) {
      if (shader->selector->info.stage == MESA_SHADER_VERTEX) {
         STATIC_ASSERT(lds_instance_id + 1 == 7);
         lds_vertex_size = MAX2(lds_vertex_size, 7);
      } else {
         assert(shader->selector->info.stage == MESA_SHADER_TESS_EVAL);

         if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
            STATIC_ASSERT(lds_tes_patch_id + 2 == 9); /* +1 for LDS padding */
            lds_vertex_size = MAX2(lds_vertex_size, 9);
         } else {
            STATIC_ASSERT(lds_tes_v + 1 == 7);
            lds_vertex_size = MAX2(lds_vertex_size, 7);
         }
      }
   }

   return lds_vertex_size;
}

/**
 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 * for the vertex outputs.
 */
static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
{
   /* The extra dword is used to avoid LDS bank conflicts. */
   unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
   LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
   LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
   return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}

static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
                                          struct ac_arg param, unsigned return_index)
{
   LLVMValueRef v = ac_get_arg(&ctx->ac, param);

   for (unsigned i = 0; i < 4; i++) {
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
                                 return_index + i, "");
   }
   return ret;
}

static void load_vertex_counts(struct si_shader_context *ctx, LLVMValueRef lds,
                               unsigned max_waves, LLVMValueRef tid,
                               LLVMValueRef *total_count,
                               LLVMValueRef *prefix_sum)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8vec4_lane = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
   unsigned num_i8vec4 = DIV_ROUND_UP(max_waves, 4);

   /* If all threads loaded the vertex counts, it would cause many LDS bank conflicts
    * and the performance could decrease up to WaveSize times (32x or 64x).
    *
    * Therefore, only load the i-th tuple of vertex counts in the i-th thread. Other threads will
    * get them through readlane. 4 8-bit vertex counts are loaded per thread.
    */
   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntULT, tid,
                                         LLVMConstInt(ctx->ac.i32, num_i8vec4, 0), ""), 17771);
   LLVMBuildStore(builder, LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, lds, tid), ""), i8vec4_lane);
   ac_build_endif(&ctx->ac, 17771);

   /* Compute the number of ES waves. */
   LLVMValueRef num_waves = get_tgsize(ctx);

   /* Compute a byte mask where each byte is either 0 or 0xff depending on whether the wave
    * exists. We need the mask to clear uninitialized bytes in LDS and to compute the prefix sum.
    *
    * 8 waves: valid_mask = ~0ull >> (64 - num_waves * 8)
    * 4 waves: valid_mask = ~0 >> (32 - num_waves * 8)
    */
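   /* Worked example (illustrative numbers): with num_waves = 3 on a wave64 chip,
    * num_waves * 8 = 24, so valid_mask = ~0ull >> (64 - 24) = 0xffffff,
    * i.e. the low 3 bytes (one per existing wave) are 0xff.
    */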
   LLVMValueRef num_waves8 = LLVMBuildShl(builder, num_waves, LLVMConstInt(ctx->ac.i32, 3, 0), "");
   LLVMValueRef valid_mask;

   if (max_waves > 4) {
      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 64, 0),
                                                 num_waves8, "");
      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i64, ~0ull, 0),
                                 LLVMBuildZExt(builder, num_waves8_rev, ctx->ac.i64, ""), "");
   } else {
      LLVMValueRef num_waves8_rev = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, 32, 0),
                                                 num_waves8, "");
      valid_mask = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, ~0, 0), num_waves8_rev, "");
   }

   /* Compute a byte mask where bytes below wave_id are 0xff, else they are 0.
    *
    * prefix_mask = ~(~0 << (wave_id * 8))
    */
   LLVMTypeRef type = max_waves > 4 ? ctx->ac.i64 : ctx->ac.i32;
   LLVMValueRef wave_id8 = LLVMBuildShl(builder, get_wave_id_in_tg(ctx),
                                        LLVMConstInt(ctx->ac.i32, 3, 0), "");
   LLVMValueRef prefix_mask =
      LLVMBuildNot(builder, LLVMBuildShl(builder, LLVMConstInt(type, ~0ull, 0),
                                         LLVMBuildZExt(builder, wave_id8, type, ""), ""), "");

   /* Compute the total vertex count and the vertex count of previous waves (prefix). */
   *total_count = ctx->ac.i32_0;
   *prefix_sum = ctx->ac.i32_0;

   for (unsigned i = 0; i < num_i8vec4; i++) {
      LLVMValueRef i8vec4;

      i8vec4 = ac_build_readlane_no_opt_barrier(&ctx->ac, LLVMBuildLoad(builder, i8vec4_lane, ""),
                                                LLVMConstInt(ctx->ac.i32, i, 0));
      /* Inactive waves have uninitialized vertex counts. Set them to 0 using this. */
      i8vec4 = LLVMBuildAnd(builder, i8vec4,
                            ac_unpack_param(&ctx->ac, valid_mask, 32 * i, 32), "");
      /* Compute the sum of all i8vec4 components and add it to the result. */
      *total_count = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
                                        (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *total_count},
                                        3, AC_FUNC_ATTR_READNONE);
      ac_set_range_metadata(&ctx->ac, *total_count, 0, 64*4 + 1); /* the result is at most 64*4 */

      /* Compute the sum of the vertex counts of all previous waves. */
      i8vec4 = LLVMBuildAnd(builder, i8vec4,
                            ac_unpack_param(&ctx->ac, prefix_mask, 32 * i, 32), "");
      *prefix_sum = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32,
                                       (LLVMValueRef[]){i8vec4, ctx->ac.i32_0, *prefix_sum},
                                       3, AC_FUNC_ATTR_READNONE);
      ac_set_range_metadata(&ctx->ac, *prefix_sum, 0, 64*4 + 1); /* the result is at most 64*4 */
   }
   *total_count = ac_build_readlane_no_opt_barrier(&ctx->ac, *total_count, NULL);
}

/**
 * Given a total thread count, update total and per-wave thread counts in input SGPRs
 * and return the per-wave thread count.
 *
 * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
 * \param tg_info            tg_info SGPR value
 * \param tg_info_num_bits   the bit size of thread count field in tg_info
 * \param tg_info_shift      the bit offset of the thread count field in tg_info
 * \param wave_info          merged_wave_info SGPR value
 * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
 * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
 */
static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
                                 LLVMValueRef *tg_info, unsigned tg_info_num_bits,
                                 unsigned tg_info_shift, LLVMValueRef *wave_info,
                                 unsigned wave_info_num_bits, unsigned wave_info_shift)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   /* Update the total thread count. */
   unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
   *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
   *tg_info = LLVMBuildOr(
      builder, *tg_info,
      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");

   /* Update the per-wave thread count. */
   LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
   *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
   *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
   *new_num_threads =
      ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
   unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
   *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
   *wave_info = LLVMBuildOr(
      builder, *wave_info,
      LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
      "");
}

static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted,
                                           void *userdata)
{
   struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac);
   LLVMValueRef *params = (LLVMValueRef *)userdata;
   LLVMValueRef gs_accepted = params[0];
   LLVMValueRef *gs_vtxptr = (LLVMValueRef *)params[1];

   ac_build_ifcc(&ctx->ac, accepted, 0);
   LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_1, gs_accepted);
   for (unsigned vtx = 0; vtx < 3; vtx++) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i8_1,
                     si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
   }
   ac_build_endif(&ctx->ac, 0);
}

/**
 * Cull primitives for NGG VS or TES, then compact vertices, which happens
 * before the VS or TES main function. Return values for the main function.
 * Also return the position, which is passed to the shader as an input,
 * so that we don't compute it twice.
 */
void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_info *info = &sel->info;
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned max_waves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size, ctx->ac.wave_size);

   assert(shader->key.opt.ngg_culling);
   assert(shader->key.as_ngg);
   assert(sel->info.stage == MESA_SHADER_VERTEX ||
          (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es));

   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
   unsigned pos_index = 0;

   for (unsigned i = 0; i < info->num_outputs; i++) {
      LLVMValueRef position[4];

      switch (info->output_semantic[i]) {
      case VARYING_SLOT_POS:
         /* If we are going to cull everything (rasterizer_discard), discard
          * the position. This is useful for analyzing maximum theoretical
          * performance without VS input loads.
          */
         if (shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE &&
             shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE) {
            for (unsigned j = 0; j < 4; j++)
               LLVMBuildStore(builder, LLVMGetUndef(ctx->ac.f32), addrs[4 * i + j]);
            break;
         }

         pos_index = i;
         for (unsigned j = 0; j < 4; j++) {
            position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         }

         /* Store Position.W into LDS. */
         LLVMBuildStore(
            builder, ac_to_integer(&ctx->ac, position[3]),
            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_w, 0)));

         /* Store Position.XY / W into LDS. */
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
            LLVMBuildStore(
               builder, ac_to_integer(&ctx->ac, val),
               ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_cull_x_div_w + chan, 0)));
         }
         break;
      }
   }

   /* Initialize the packed data. */
   LLVMBuildStore(
      builder, ctx->ac.i32_0,
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
   ac_build_s_barrier(&ctx->ac);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* The hardware requires that there are no holes between unculled vertices,
    * which means we have to pack ES threads, i.e. reduce the ES thread count
    * and move ES input VGPRs to lower threads. The upside is that varyings
    * are only fetched and computed for unculled vertices.
    *
    * Vertex compaction:
    *
    * Part 1: Store the surviving vertex count for each wave in LDS.
    *   - The GS culling code notifies ES threads which vertices were accepted.
    *   - Barrier
    *   - ES threads will compute the vertex count and store it in LDS.
    *   - Barrier
    *   - Each wave loads the vertex counts from LDS.
    *
    * Part 2: Compact ES threads:
    *   - Compute the prefix sum for each surviving vertex. This is the new thread ID
    *     of the vertex.
    *   - Write input VGPRs and vertex positions for each surviving vertex into the LDS
    *     address of the new thread ID.
    *   - Now kill all waves that have inactive threads.
    *   - Barrier
    *   - Update vertex indices and null flag in the GS input VGPRs.
    *
    * Part 3: Update input GPRs
    *   - For all waves, update per-wave thread counts in input SGPRs.
    *   - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
    */

   LLVMValueRef vtxindex[3];
   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
      /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
       * into these VGPRs.
       */
      vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
      vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
      vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   }
   LLVMValueRef gs_vtxptr[] = {
      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
   };
   es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   /* Adding these optimization barriers improves the generated code as follows. Crazy right?
    *
    * - s_mov_b32 s4, 0xffff
    * - v_lshrrev_b32_e32 v10, 16, v0
    * - v_and_b32_e32 v12, s4, v0
    * - v_and_b32_e32 v11, s4, v1
    *   s_bfe_u32 s4, s3, 0x80008
    * - s_mov_b64 s[8:9], 0
    * - v_mul_u32_u24_e32 v0, 28, v10
    * - v_mul_u32_u24_e32 v9, 28, v12
    * - v_mul_u32_u24_e32 v1, 28, v11
    * + v_mov_b32_e32 v11, 28
    *   v_cmp_gt_u32_e32 vcc, s4, v2
    * + s_mov_b64 s[8:9], 0
    *   s_waitcnt lgkmcnt(0)
    *   s_barrier
    * + v_mul_u32_u24_sdwa v10, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    * + v_mul_u32_u24_sdwa v23, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    * + v_mul_u32_u24_sdwa v0, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    *   s_and_saveexec_b64 s[44:45], vcc
    *   s_cbranch_execz BB2_8
    * - v_mul_u32_u24_e32 v16, 28, v12
    * - v_mul_u32_u24_e32 v17, 28, v11
    * - v_mul_u32_u24_e32 v18, 28, v10
    */
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[0], false);
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[1], false);
   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[2], false);

   LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");

   /* Do culling in GS threads. */
   ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
   {
      /* Load positions. */
      LLVMValueRef pos[3][4] = {};
      for (unsigned vtx = 0; vtx < 3; vtx++) {
         for (unsigned chan = 0; chan < 4; chan++) {
            unsigned index;
            if (chan == 0 || chan == 1)
               index = lds_pos_cull_x_div_w + chan;
            else if (chan == 3)
               index = lds_pos_cull_w;
            else
               continue;

            LLVMValueRef addr =
               ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
            pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
            pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
         }
      }

      /* Load the viewport state for small prim culling. */
      LLVMValueRef vp = ac_build_load_invariant(
         &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
      vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
      LLVMValueRef vp_scale[2], vp_translate[2];
      vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
      vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
      vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
      vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);

      /* Get the small prim filter precision. */
      LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
      small_prim_precision =
         LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
      small_prim_precision =
         LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
      small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");

      /* Execute culling code. */
      struct ac_cull_options options = {};
      options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
      options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
      options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
      options.cull_small_prims = options.cull_view_xy;
      options.cull_zero_area = options.cull_front || options.cull_back;
      options.cull_w = true;

      /* Tell ES threads whether their vertex survived. */
      LLVMValueRef params[] = {
         gs_accepted,
         (void*)gs_vtxptr,
      };
      ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
                       small_prim_precision, &options,
                       gfx10_build_primitive_accepted, params);
   }
   ac_build_endif(&ctx->ac, 16002);
   ac_build_s_barrier(&ctx->ac);

   gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");

   LLVMValueRef vertex_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
   LLVMValueRef vertex_mask = ac_build_alloca(&ctx->ac, ctx->ac.iN_wavemask, "");

   /* Convert the per-vertex accept flag to a vertex thread mask, store it in registers. */
   ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
   {
      LLVMValueRef accepted =
         LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
      accepted = LLVMBuildICmp(builder, LLVMIntNE, accepted, ctx->ac.i8_0, "");
      LLVMValueRef mask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);

      LLVMBuildStore(builder, accepted, vertex_accepted);
      LLVMBuildStore(builder, mask, vertex_mask);
   }
   ac_build_endif(&ctx->ac, 16007);

   /* Store the per-wave vertex count to LDS. Non-ES waves store 0. */
   vertex_mask = LLVMBuildLoad(builder, vertex_mask, "");
   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
   {
      LLVMValueRef vertex_count = ac_build_bit_count(&ctx->ac, vertex_mask);
      LLVMBuildStore(builder, LLVMBuildTrunc(builder, vertex_count, ctx->ac.i8, ""),
                     si_build_gep_i8_var(ctx, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
   }
   ac_build_endif(&ctx->ac, 16008);

   ac_build_s_barrier(&ctx->ac);

   /* Load the vertex masks and compute the new ES thread count. */
   LLVMValueRef new_num_es_threads, prefix_sum, kill_wave;
   load_vertex_counts(ctx, ctx->gs_ngg_scratch, max_waves, tid, &new_num_es_threads,
                      &prefix_sum);

   bool uses_instance_id = ctx->stage == MESA_SHADER_VERTEX &&
                           (sel->info.uses_instanceid ||
                            shader->key.part.vs.prolog.instance_divisor_is_one ||
                            shader->key.part.vs.prolog.instance_divisor_is_fetched);
   bool uses_tes_prim_id = ctx->stage == MESA_SHADER_TESS_EVAL &&
                           (sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id);

   /* ES threads compute their prefix sum, which is the new ES thread ID.
    * Then they write the vertex position and input VGPRs into the LDS address
    * of the new thread ID. It will be used to load input VGPRs by compacted
    * threads.
    */
   vertex_accepted = LLVMBuildLoad(builder, vertex_accepted, "");
   ac_build_ifcc(&ctx->ac, vertex_accepted, 16009);
   {
      /* Add the number of bits set in vertex_mask up to the current thread ID - 1
       * to get the prefix sum.
       */
      prefix_sum = LLVMBuildAdd(builder, prefix_sum, ac_build_mbcnt(&ctx->ac, vertex_mask), "");

      LLVMValueRef new_id = prefix_sum;
      LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);

      LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
                     si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));

      /* Store Position.XYZW into LDS. */
      for (unsigned chan = 0; chan < 4; chan++) {
         LLVMBuildStore(
            builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")),
            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
      }

      /* Store VertexID and InstanceID into LDS. ES threads will have to load them
       * from LDS after vertex compaction and use them instead of their own
       * system values.
       */
      if (ctx->stage == MESA_SHADER_VERTEX) {
         LLVMBuildStore(
            builder, ctx->abi.vertex_id,
            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
         if (uses_instance_id) {
            LLVMBuildStore(
               builder, ctx->abi.instance_id,
               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
         }
      } else {
         assert(ctx->stage == MESA_SHADER_TESS_EVAL);
         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_u)),
                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tes_v)),
                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
         LLVMBuildStore(builder, LLVMBuildTrunc(builder, ac_get_arg(&ctx->ac, ctx->args.tes_rel_patch_id), ctx->ac.i8, ""),
                        si_build_gep_i8(ctx, new_vtx, lds_byte2_tes_rel_patch_id));
         if (uses_tes_prim_id) {
            LLVMBuildStore(
               builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
         }
      }
   }
   ac_build_endif(&ctx->ac, 16009);

   /* If all vertices are culled, set the primitive count to 0, so that all waves are culled here. */
   LLVMValueRef num_primitives = ngg_get_prim_cnt(ctx);
   num_primitives = LLVMBuildSelect(builder,
                                    LLVMBuildICmp(builder, LLVMIntEQ, new_num_es_threads,
                                                  ctx->ac.i32_0, ""),
                                    ctx->ac.i32_0, num_primitives, "");
   /* Kill waves that have inactive threads. */
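   /* A wave is killed when the index of its first thread, wave_id * wave_size,
    * is not below either the compacted ES thread count or the primitive count,
    * i.e. the wave has neither surviving vertices nor primitives to process.
    */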
   kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
                             ac_build_imax(&ctx->ac, new_num_es_threads, num_primitives),
                             LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
                                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
                             "");
   ac_build_ifcc(&ctx->ac, kill_wave, 19202);
   {
      /* If we are killing wave 0, send that there are no primitives
       * in this threadgroup.
       */
      ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
      ac_build_s_endpgm(&ctx->ac);
   }
   ac_build_endif(&ctx->ac, 19202);
   ac_build_s_barrier(&ctx->ac);

   /* Send the final vertex and primitive counts. */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
                                 ngg_get_prim_cnt(ctx));

   /* Update thread counts in SGPRs. */
   LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->args.gs_tg_info);
   LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->args.merged_wave_info);

   /* This also converts the thread count from the total count to the per-wave count. */
   update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
                        0);

   /* Update vertex indices in VGPR0 (same format as NGG passthrough).
    *
    * Set the null flag at the beginning (culled), and then
    * overwrite it for accepted primitives.
    */
   LLVMValueRef new_vgpr0 =
      ac_build_alloca_init(&ctx->ac, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), "");

   /* Get vertex indices after vertex compaction. */
   ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
   {
      struct ac_ngg_prim prim = {};
      prim.num_vertices = 3;
      prim.isnull = ctx->ac.i1false;

      for (unsigned vtx = 0; vtx < 3; vtx++) {
         prim.index[vtx] = LLVMBuildLoad(
            builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
         prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
         prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
      }

      /* Set the new GS input VGPR. */
      LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
   }
   ac_build_endif(&ctx->ac, 16011);

   if (gfx10_ngg_export_prim_early(shader))
      gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));

   /* Prepare LDS addresses of the new ES input VGPRs. */
   LLVMValueRef input_vgpr_addresses[4] = {
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)),
      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)),
   };
   if (ctx->stage == MESA_SHADER_TESS_EVAL) {
      input_vgpr_addresses[2] = si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id);
      if (uses_tes_prim_id) {
         input_vgpr_addresses[3] = ac_build_gep0(&ctx->ac, es_vtxptr,
                                                 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0));
      }
   }

   /* Return values for the main function. */
   LLVMValueRef ret = ctx->return_value;
   LLVMValueRef val;

   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
   ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
   if (ctx->stage == MESA_SHADER_TESS_EVAL)
      ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 4);

   ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
                             8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
   ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
      ret = si_insert_input_ptr(ctx, ret, ctx->args.vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);

      for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
         ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
                                     8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
      ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
      ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
   }

   unsigned vgpr;
   if (ctx->stage == MESA_SHADER_VERTEX) {
      if (shader->selector->num_vbos_in_user_sgprs) {
         vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
      } else {
         vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
      }
   } else {
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
   }

   val = LLVMBuildLoad(builder, new_vgpr0, "");
   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
   vgpr++; /* gs_vtx23_offset */

   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   vgpr++; /* gs_vtx45_offset */

   /* Set the input VGPRs to the corresponding LDS addresses where the VGPR values are
    * stored. The VS prolog will load them.
    */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[0], ctx->ac.i32, "");
      ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                 ""); /* VGPR5 - VertexID */
      vgpr += 2;
      if (uses_instance_id) {
         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[1], ctx->ac.i32, "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                    ""); /* VGPR8 - InstanceID */
      } else {
         vgpr++;
      }
   } else {
      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
      unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
      for (unsigned i = 0; i < num_vgprs; i++) {
         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[i], ctx->ac.i32, "");
         ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
      }
      if (num_vgprs == 3)
         vgpr++;
   }

   /* These two also use LDS. */
   if (sel->info.writes_edgeflag ||
       (ctx->stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
      ac_build_s_barrier(&ctx->ac);

   ctx->return_value = ret;
}

/**
 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
 */
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;
   struct si_shader_info *info = &sel->info;
   struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp, tmp2;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   LLVMValueRef vertex_ptr = NULL;

   if (sel->so.num_outputs || sel->info.writes_edgeflag)
      vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

   for (unsigned i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic = info->output_semantic[i];

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;

         /* TODO: we may store more outputs than streamout needs,
          * but streamout performance isn't that important.
          */
         if (sel->so.num_outputs) {
            tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
            tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
            tmp2 = ac_to_integer(&ctx->ac, tmp2);
            LLVMBuildStore(builder, tmp2, tmp);
         }
      }

      /* Store the edgeflag at the end (if streamout is enabled) */
      if (info->output_semantic[i] == VARYING_SLOT_EDGE && sel->info.writes_edgeflag) {
         LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
         /* The output is a float, but the hw expects a 1-bit integer. */
         edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
         edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);

         tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
         LLVMBuildStore(builder, edgeflag, tmp);
      }
   }

   bool unterminated_es_if_block =
      !sel->so.num_outputs && !sel->info.writes_edgeflag &&
      !ctx->screen->use_ngg_streamout && /* no query buffer */
      (ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);

   if (!unterminated_es_if_block)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
   LLVMValueRef is_es_thread = si_is_es_thread(ctx);
   LLVMValueRef vtxindex[3];

   if (ctx->shader->key.opt.ngg_culling) {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
   } else {
      vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
      vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
      vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
   }

   /* Determine the number of vertices per primitive. */
   unsigned num_vertices;
   LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);

   /* Streamout */
   LLVMValueRef emitted_prims = NULL;

   if (sel->so.num_outputs) {
      assert(!unterminated_es_if_block);

      struct ngg_streamout nggso = {};
      nggso.num_vertices = num_vertices_val;
      nggso.prim_enable[0] = is_gs_thread;

      for (unsigned i = 0; i < num_vertices; ++i)
         nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);

      build_streamout(ctx, &nggso);
      emitted_prims = nggso.emit[0];
   }

   LLVMValueRef user_edgeflags[3] = {};

   if (sel->info.writes_edgeflag) {
      assert(!unterminated_es_if_block);

      /* Streamout already inserted the barrier, so don't insert it again. */
      if (!sel->so.num_outputs)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
      for (unsigned i = 0; i < num_vertices; i++) {
         tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
         tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
         tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
         tmp = LLVMBuildLoad(builder, tmp, "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

         user_edgeflags[i] = ac_build_alloca_init(&ctx->ac, tmp, "");
      }
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Copy Primitive IDs from GS threads to the LDS address corresponding
    * to the ES thread of the provoking vertex.
    */
   if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
      assert(!unterminated_es_if_block);

      /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
      if (sel->so.num_outputs || sel->info.writes_edgeflag)
         ac_build_s_barrier(&ctx->ac);

      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
      /* Extract the PROVOKING_VTX_INDEX field. */
      LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);

      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
      LLVMValueRef provoking_vtx_index =
         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
      LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);

      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
                     ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
      ac_build_endif(&ctx->ac, 5400);
   }

   /* Update query buffer */
   if (ctx->screen->use_ngg_streamout && !info->base.vs.blit_sgprs_amd) {
      assert(!unterminated_es_if_block);

      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
      ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
      tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5030);
      tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
                          sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5031);
      {
         LLVMValueRef args[] = {
            ngg_get_prim_cnt(ctx),
            ngg_get_query_buf(ctx),
            LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
            ctx->ac.i32_0,                        /* soffset */
            ctx->ac.i32_0,                        /* cachepolicy */
         };

         if (sel->so.num_outputs) {
            args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
            args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
                                         ctx->ac.i32_1);
         }

         /* TODO: should this be 64-bit atomics? */
         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
                            0);
      }
      ac_build_endif(&ctx->ac, 5031);
      ac_build_endif(&ctx->ac, 5030);
      ac_build_endif(&ctx->ac, 5029);
   }

   /* Build the primitive export. */
   if (!gfx10_ngg_export_prim_early(ctx->shader)) {
      assert(!unterminated_es_if_block);
      gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
   }

   /* Export per-vertex data (positions and parameters). */
   if (!unterminated_es_if_block)
      ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
   {
      unsigned i;

      /* Unconditionally (re-)load the values for proper SSA form. */
      for (i = 0; i < info->num_outputs; i++) {
         /* If the NGG cull shader part computed the position, don't
          * use the position from the current shader part. Instead,
          * load it from LDS.
          */
         if (info->output_semantic[i] == VARYING_SLOT_POS &&
             ctx->shader->key.opt.ngg_culling) {
            vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));

            for (unsigned j = 0; j < 4; j++) {
               tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
               tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
               tmp = LLVMBuildLoad(builder, tmp, "");
               outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
            }
         } else {
            for (unsigned j = 0; j < 4; j++) {
               outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
            }
         }
      }

      if (ctx->shader->key.mono.u.vs_export_prim_id) {
         outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;

         if (ctx->stage == MESA_SHADER_VERTEX) {
            /* Wait for GS stores to finish. */
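            /* (The matching stores are done above by GS threads, which copy the
             * primitive ID into the LDS slot of the provoking vertex.)
             */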
            /* Wait for GS stores to finish. */
            ac_build_s_barrier(&ctx->ac);

            tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
            tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
            outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
         } else {
            assert(ctx->stage == MESA_SHADER_TESS_EVAL);
            outputs[i].values[0] = si_get_primitive_id(ctx, 0);
         }

         outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
         for (unsigned j = 1; j < 4; j++)
            outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);

         memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
         i++;
      }

      si_llvm_build_vs_exports(ctx, outputs, i);
   }
   ac_build_endif(&ctx->ac, 6002);
}

static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;

   LLVMTypeRef elements[2] = {
      LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
      LLVMArrayType(ctx->ac.i8, 4),
   };
   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}

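/* Per-vertex LDS layout implied by the struct above, one entry per emitted
 * vertex: 4 * num_outputs output dwords followed by 4 primitive-flag bytes,
 * one byte per GS vertex stream. */
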
/**
 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
 * is in emit order; that is:
 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
 * - during vertex emit, i.e. while the API GS shader invocation is running,
 *   N = threadidx * gs.vertices_out + emitidx
 *
 * Goals of the LDS memory layout:
 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
 *    in uniform control flow
 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
 *    culling
 * 3. Agnostic to the number of waves (since we don't know it before compiling)
 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
 * 5. Avoid wasting memory.
 *
 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
 * layout, elimination of bank conflicts requires that each vertex occupy an
 * odd number of dwords. We use the additional dword to store the output stream
 * index as well as a flag to indicate whether this vertex ends a primitive
 * for rasterization.
 *
 * Swizzling is required to satisfy points 1 and 2 simultaneously.
 *
 * Vertices are stored in export order (gsthread * gs.vertices_out + emitidx).
 * Indices are swizzled in groups of 32, which ensures point 1 without
 * disturbing point 2.
 *
 * \return an LDS pointer to type {[N x i32], [4 x i8]}
 */
static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

   /* gs.vertices_out = 2^(write_stride_2exp) * some odd number */
   unsigned write_stride_2exp = ffs(sel->info.base.gs.vertices_out) - 1;
   if (write_stride_2exp) {
      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
      LLVMValueRef swizzle = LLVMBuildAnd(
         builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
      vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
   }

   return ac_build_gep0(&ctx->ac, storage, vertexidx);
}

static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
                                           LLVMValueRef emitidx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;

   tmp = LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false);
   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
   return ngg_gs_vertex_ptr(ctx, vertexidx);
}

static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
                                               LLVMValueRef vertexptr, unsigned out_idx)
{
   LLVMValueRef gep_idx[3] = {
      ctx->ac.i32_0, /* implied C-style array */
      ctx->ac.i32_0, /* first struct entry */
      LLVMConstInt(ctx->ac.i32, out_idx, false),
   };
   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}

static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
                                                 LLVMValueRef vertexptr, unsigned stream)
{
   LLVMValueRef gep_idx[3] = {
      ctx->ac.i32_0, /* implied C-style array */
      ctx->ac.i32_1, /* second struct entry */
      LLVMConstInt(ctx->ac.i32, stream, false),
   };
   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}

void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;
   const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    */
   const LLVMValueRef can_emit =
      LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
                    LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");

   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
   tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
   LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

   ac_build_ifcc(&ctx->ac, can_emit, 9001);

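   /* This vertex goes to LDS slot gsthread * gs.vertices_out + emitidx
    * (possibly swizzled; see ngg_gs_vertex_ptr above). */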
   const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
   unsigned out_idx = 0;
   for (unsigned i = 0; i < info->num_outputs; i++) {
      for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);
         LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
      }
   }
   assert(out_idx * 4 == sel->gsvs_vertex_size);

   /* Determine and store whether this vertex completed a primitive. */
   const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

   tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->info.base.gs.output_primitive) - 1, false);
   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

   /* Since the geometry shader emits triangle strips, we need to
    * track which primitive is odd and swap vertex indices to get
    * the correct vertex order.
    */
   LLVMValueRef is_odd = ctx->ac.i1false;
   if (stream == 0 && u_vertices_per_prim(sel->info.base.gs.output_primitive) == 3) {
      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
   }

   tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
   LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

   /* The per-vertex primitive flag encoding:
    * bit 0: whether this vertex finishes a primitive
    * bit 1: whether the primitive is odd (if we are emitting triangle strips)
    */
   tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
   tmp = LLVMBuildOr(
      builder, tmp,
      LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
   LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));

   tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
   tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
   LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);

   ac_build_endif(&ctx->ac, 9001);
}

void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
{
   /* Zero out the part of LDS scratch that is used to accumulate the
    * per-stream generated primitive count.
    */
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp;

   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
   ac_build_ifcc(&ctx->ac, tmp, 5090);
   {
      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
      LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
   }
   ac_build_endif(&ctx->ac, 5090);

   ac_build_s_barrier(&ctx->ac);
}

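/* Rough structure of the epilogue below: clear the primitive flags of
 * never-emitted vertices, accumulate the per-stream primitive counts, run
 * streamout and the shader query update, then compact the live vertices and
 * export primitives and vertex attributes. */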
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;
   const unsigned verts_per_prim = u_vertices_per_prim(sel->info.base.gs.output_primitive);
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
   LLVMValueRef tmp, tmp2;

   /* Zero out remaining (non-emitted) primitive flags.
    *
    * Note: Alternatively, we could pass the relevant gs_next_vertex to
    * the emit threads via LDS. This is likely worse in the expected
    * typical case where each GS thread emits the full set of
    * vertices.
    */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);

      ac_build_bgnloop(&ctx->ac, 5100);

      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
                          LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5101);
      ac_build_break(&ctx->ac);
      ac_build_endif(&ctx->ac, 5101);

      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));

      ac_build_endloop(&ctx->ac, 5100);
   }

   /* Accumulate generated primitives counts across the entire threadgroup. */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);

      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5105);
      {
         LLVMBuildAtomicRMW(
            builder, LLVMAtomicRMWBinOpAdd,
            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
            numprims, LLVMAtomicOrderingMonotonic, false);
      }
      ac_build_endif(&ctx->ac, 5105);
   }

   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   ac_build_s_barrier(&ctx->ac);

   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);

   /* Streamout */
   if (sel->so.num_outputs) {
      struct ngg_streamout nggso = {};

      nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);

      LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
         nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
      }

      for (unsigned i = 0; i < verts_per_prim; ++i) {
         tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
                            "");
         tmp = ngg_gs_vertex_ptr(ctx, tmp);
         nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
      }

      build_streamout(ctx, &nggso);
   }

   /* Write shader query data. */
   if (ctx->screen->use_ngg_streamout) {
      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
      ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
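      /* The query buffer appears to hold one 32-byte record per vertex stream:
       * thread t adds LDS scratch slot t to the counter at byte offset
       * (t & 3) * 32, plus (t >> 2) * 8 when streamout counters are written as
       * well, all relative to the base soffset of 16 passed below. */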
      unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
                          LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5110);
      {
         LLVMValueRef offset;
         tmp = tid;
         if (sel->so.num_outputs)
            tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
         offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
         if (sel->so.num_outputs) {
            tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
            tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
            offset = LLVMBuildAdd(builder, offset, tmp, "");
         }

         tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
         LLVMValueRef args[] = {
            tmp, ngg_get_query_buf(ctx),
            offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
            ctx->ac.i32_0,                                /* cachepolicy */
         };
         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
                            0);
      }
      ac_build_endif(&ctx->ac, 5110);
      ac_build_endif(&ctx->ac, 5109);
   }

   /* Determine vertex liveness. */
   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");

   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5120);
   {
      for (unsigned i = 0; i < verts_per_prim; ++i) {
         const LLVMValueRef primidx =
            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");

         if (i > 0) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
         }

         /* Load primitive liveness */
         tmp = ngg_gs_vertex_ptr(ctx, primidx);
         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

         tmp = LLVMBuildLoad(builder, vertliveptr, "");
         tmp = LLVMBuildOr(builder, tmp, primlive, "");
         LLVMBuildStore(builder, tmp, vertliveptr);

         if (i > 0)
            ac_build_endif(&ctx->ac, 5121 + i);
      }
   }
   ac_build_endif(&ctx->ac, 5120);

   /* Inclusive scan addition across the current wave. */
   LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
   struct ac_wg_scan vertlive_scan = {};
   vertlive_scan.op = nir_op_iadd;
   vertlive_scan.enable_reduce = true;
   vertlive_scan.enable_exclusive = true;
   vertlive_scan.src = vertlive;
   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
   vertlive_scan.numwaves = get_tgsize(ctx);
   vertlive_scan.maxwaves = DIV_ROUND_UP(256, ctx->ac.wave_size);

   ac_build_wg_scan(&ctx->ac, &vertlive_scan);

   /* Skip all exports (including index exports) when possible. */
   LLVMValueRef have_exports =
      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");

   /* Allocate export space. Send this message as early as possible, to
    * hide the latency of the SQ <-> SPI roundtrip.
    */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
                                 num_emit_threads);

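   /* From the scan: result_reduce is the total number of live vertices in the
    * threadgroup (the vertex count sent above), and result_exclusive is the
    * compacted output slot of each live vertex (used below). */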
   /* Setup the reverse vertex compaction permutation. We re-use stream 1
    * of the primitive liveness flags, relying on the fact that each
    * threadgroup can have at most 256 threads. */
   ac_build_ifcc(&ctx->ac, vertlive, 5130);
   {
      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
   }
   ac_build_endif(&ctx->ac, 5130);

   ac_build_s_barrier(&ctx->ac);

   /* Export primitive data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5140);
   {
      LLVMValueRef flags;
      struct ac_ngg_prim prim = {};
      prim.num_vertices = verts_per_prim;

      tmp = ngg_gs_vertex_ptr(ctx, tid);
      flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");

      for (unsigned i = 0; i < verts_per_prim; ++i) {
         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
         prim.edgeflag[i] = ctx->ac.i1false;
      }

      /* Geometry shaders output triangle strips, but NGG expects triangles. */
      if (verts_per_prim == 3) {
         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 5140);

   /* Export position and parameter data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
   ac_build_ifcc(&ctx->ac, tmp, 5145);
   {
      struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];

      tmp = ngg_gs_vertex_ptr(ctx, tid);
      tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);

      unsigned out_idx = 0;
      for (unsigned i = 0; i < info->num_outputs; i++) {
         outputs[i].semantic = info->output_semantic[i];

         for (unsigned j = 0; j < 4; j++, out_idx++) {
            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
            tmp = LLVMBuildLoad(builder, tmp, "");
            outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
            outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
         }
      }

      si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
   }
   ac_build_endif(&ctx->ac, 5145);
}

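/* Limit the primitive count to what the ES vertices can feed: the first
 * primitive consumes min_verts_per_prim vertices and, assuming maximal reuse,
 * each further primitive consumes at least one more (two with adjacency), so
 * max_gsprims <= 1 + (max_esverts - min_verts_per_prim) [/ 2]. E.g. plain
 * triangles with max_esverts == 96 allow at most 94 primitives. */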
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
                                     unsigned min_verts_per_prim, bool use_adjacency)
{
   unsigned max_reuse = max_esverts - min_verts_per_prim;
   if (use_adjacency)
      max_reuse /= 2;
   *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}

unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;

   if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->so.num_outputs)
      return 44;

   return 8;
}

/**
 * Determine subgroup information like maximum number of vertices and prims.
 *
 * This happens before the shader is uploaded, since LDS relocations during
 * upload depend on the subgroup size.
 */
bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
{
   const struct si_shader_selector *gs_sel = shader->selector;
   const struct si_shader_selector *es_sel =
      shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
   const gl_shader_stage gs_stage = gs_sel->info.stage;
   const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
   const unsigned input_prim = si_get_input_prim(gs_sel);
   const bool use_adjacency =
      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
   const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
   const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;

   /* All these are in dwords: */
   /* GE can only use 8K dwords (32KB) of LDS per workgroup. */
   const unsigned max_lds_size = 8 * 1024 - gfx10_ngg_get_scratch_dw_size(shader);
   const unsigned target_lds_size = max_lds_size;
   unsigned esvert_lds_size = 0;
   unsigned gsprim_lds_size = 0;

   /* All these are per subgroup: */
   const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
   bool max_vert_out_per_gs_instance = false;
   unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
   unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;

   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
      /* All lanes are filled in wave32. */
      max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32);
      max_esverts_base = max_gsprims_base * 3;
   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
      max_gsprims_base = max_esverts_base - 2;
   }

   if (gs_stage == MESA_SHADER_GEOMETRY) {
      bool force_multi_cycling = false;
      unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;

   retry_select_mode:
      if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
         if (max_out_verts_per_gsprim) {
            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
         }
      } else {
         /* Use special multi-cycling mode in which each GS
          * instance gets its own subgroup. Does not work with
          * tessellation. */
         max_vert_out_per_gs_instance = true;
         max_gsprims_base = 1;
         max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
      }

      esvert_lds_size = es_sel->esgs_itemsize / 4;
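      /* Per emitted GS vertex: gsvs_vertex_size / 4 output dwords plus one
       * dword that packs the four per-stream primitive flag bytes (the
       * {[N x i32], [4 x i8]} LDS layout above). */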
      gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;

      if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
         if (gs_sel->tess_turns_off_ngg || es_sel->info.stage != MESA_SHADER_TESS_EVAL) {
            force_multi_cycling = true;
            goto retry_select_mode;
         }
      }
   } else {
      /* VS and TES. */
      /* LDS size for passing data from ES to GS. */
      esvert_lds_size = ngg_nogs_vertex_size(shader);
   }

   unsigned max_gsprims = max_gsprims_base;
   unsigned max_esverts = max_esverts_base;

   if (esvert_lds_size)
      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
   if (gsprim_lds_size)
      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);

   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);

   if (esvert_lds_size || gsprim_lds_size) {
      /* Now that we have a rough proportionality between esverts
       * and gsprims based on the primitive type, scale both of them
       * down simultaneously based on required LDS space.
       *
       * We could be smarter about this if we knew how much vertex
       * reuse to expect.
       */
      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
      if (lds_total > target_lds_size) {
         max_esverts = max_esverts * target_lds_size / lds_total;
         max_gsprims = max_gsprims * target_lds_size / lds_total;

         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
      }
   }

   /* Round up towards full wave sizes for better ALU utilization. */
   if (!max_vert_out_per_gs_instance) {
      const unsigned wavesize = si_get_shader_wave_size(shader);
      unsigned orig_max_esverts;
      unsigned orig_max_gsprims;
      do {
         orig_max_esverts = max_esverts;
         orig_max_gsprims = max_gsprims;

         max_esverts = align(max_esverts, wavesize);
         max_esverts = MIN2(max_esverts, max_esverts_base);
         if (esvert_lds_size)
            max_esverts =
               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);

         /* Hardware restriction: minimum value of max_esverts */
         if (gs_sel->screen->info.chip_class == GFX10)
            max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
         else
            max_esverts = MAX2(max_esverts, min_esverts);

         max_gsprims = align(max_gsprims, wavesize);
         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
         if (gsprim_lds_size) {
            /* Don't count unusable vertices to the LDS size. Those are vertices above
             * the maximum number of vertices that can occur in the workgroup,
             * which is e.g. max_gsprims * 3 for triangles.
             */
            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
            max_gsprims =
               MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
         }
         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);

      /* Verify the restriction. */
      if (gs_sel->screen->info.chip_class == GFX10)
         assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
      else
         assert(max_esverts >= min_esverts);
   } else {
      /* Hardware restriction: minimum value of max_esverts */
      if (gs_sel->screen->info.chip_class == GFX10)
         max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
      else
         max_esverts = MAX2(max_esverts, min_esverts);
   }

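   /* Per-subgroup output vertex count: one GS instance's worth of vertices in
    * multi-cycling mode, prims * invocations * vertices_out for a normal GS,
    * and simply the ES vertex count for VS/TES. */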
   unsigned max_out_vertices =
      max_vert_out_per_gs_instance
         ? gs_sel->info.base.gs.vertices_out
         : gs_stage == MESA_SHADER_GEOMETRY
              ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
              : max_esverts;
   assert(max_out_vertices <= 256);

   unsigned prim_amp_factor = 1;
   if (gs_stage == MESA_SHADER_GEOMETRY) {
      /* Number of output primitives per GS input primitive after
       * GS instancing. */
      prim_amp_factor = gs_sel->info.base.gs.vertices_out;
   }

   /* Fix up the thread counts for fast launch. */
   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
      /* The vertex count must be a multiple of 3. */
      max_esverts -= max_esverts % 3;
      /* We can only decrease the size, not increase it. */
      if (max_gsprims * 3 < max_esverts) {
         max_esverts = max_gsprims * 3;
      } else {
         max_gsprims = max_esverts / 3;
      }
   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
      /* The primitive count must be even to get correct winding for triangle strips. */
      max_gsprims &= ~1;
      if (max_gsprims - 2 < max_esverts) {
         max_esverts = max_gsprims + 2;
      } else {
         max_gsprims = max_esverts - 2;
         max_gsprims &= ~1;
         max_esverts = max_gsprims + 2;
      }
   }

   shader->ngg.hw_max_esverts = max_esverts;
   shader->ngg.max_gsprims = max_gsprims;
   shader->ngg.max_out_verts = max_out_vertices;
   shader->ngg.prim_amp_factor = prim_amp_factor;
   shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;

   /* Don't count unusable vertices. */
   shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
                                    esvert_lds_size;
   shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;

   assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */

   /* If asserts are disabled, we use the same conditions to return false. */
   return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
          max_out_vertices <= 256 &&
          shader->ngg.hw_max_esverts >= min_esverts;
}