CoCalc -- si_shader_llvm

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
⁴⁵⁷⁰ views
1
/*
2
 * Copyright 2020 Advanced Micro Devices, Inc.
3
 * All Rights Reserved.
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * on the rights to use, copy, modify, merge, publish, distribute, sub
9
 * license, and/or sell copies of the Software, and to permit persons to whom
10
 * the Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23
 */
24

25
#include "si_pipe.h"
26
#include "si_shader_internal.h"
27
#include "sid.h"
28
#include "util/u_memory.h"
29

30
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
31
{
32
   /* Return true if the current thread should execute an ES thread. */
33
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
34
                        si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8), "");
35
}
36

37
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
38
{
39
   /* Return true if the current thread should execute a GS thread. */
40
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
41
                        si_unpack_param(ctx, ctx->args.merged_wave_info, 8, 8), "");
42
}
43

44
/* Load one channel of a GS input from the ESGS ring and bitcast it to `type`.
 *
 * input_index      - index into info->input_semantic[] of the input to read
 * vtx_offset_param - which per-vertex offset to use (0..5 on GFX9+; on older
 *                    chips it indexes ctx->args.gs_vtx_offset[])
 * type             - LLVM type the loaded value is bitcast to
 * swizzle          - channel within the input slot
 *
 * Returns the loaded value, or NULL if vtx_offset_param is out of range on
 * GFX9+ (after asserting).
 */
static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
                                          unsigned vtx_offset_param, LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef vtx_offset, soffset;
   LLVMValueRef value;
   unsigned param;

   param = si_shader_io_get_unique_index(info->input_semantic[input_index], false);

   /* GFX9 has the ESGS ring in LDS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      /* Two 16-bit vertex offsets are packed per argument: even indices are
       * unpacked from bit 0, odd indices from bit 16.
       */
      unsigned shift = vtx_offset_param % 2 ? 16 : 0;

      switch (vtx_offset_param / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, shift, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, shift, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, shift, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset =
         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      /* Reuse the function-scope `value` instead of shadowing it. */
      value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
                                ctx->ac.f32, ac_glc, true, false);
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}
97

98
static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
99
                                         unsigned driver_location, unsigned component,
100
                                         unsigned num_components, unsigned vertex_index,
101
                                         LLVMTypeRef type)
102
{
103
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
104

105
   LLVMValueRef value[4];
106
   for (unsigned i = component; i < component + num_components; i++) {
107
      value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
108
                                       vertex_index, type, i);
109
   }
110

111
   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
112
}
113

114
/* Pass GS inputs from ES to GS on GFX9. */
115
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
116
{
117
   if (!ctx->shader->is_monolithic)
118
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
119

120
   LLVMValueRef ret = ctx->return_value;
121

122
   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
123
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
124
   if (ctx->shader->key.as_ngg)
125
      ret = si_insert_input_ptr(ctx, ret, ctx->args.gs_tg_info, 2);
126
   else
127
      ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2);
128
   ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
129
   ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
130

131
   ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
132
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
133
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
134
   if (ctx->screen->use_ngg) {
135
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
136
   }
137

138
   unsigned vgpr = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
139

140
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
141
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
142
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
143
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
144
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
145
   ctx->return_value = ret;
146
}
147

148
/* ES epilogue: store every used ES output channel (except viewport and layer)
 * to the ESGS ring. On GFX9+ the ring is in LDS and the GS inputs are then
 * passed on through the return value; on older chips the ring is written with
 * buffer stores.
 *
 * addrs holds one pointer per output channel, indexed as 4 * output + chan.
 * max_outputs is not used here.
 */
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMBuilderRef builder = ctx->ac.builder;
   const bool ring_in_lds = ctx->screen->info.chip_class >= GFX9;
   LLVMValueRef lds_base = NULL;

   if (ring_in_lds && info->num_outputs) {
      /* lds_base = (thread_id | wave_idx * wave_size) * (esgs_itemsize / 4) */
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
      LLVMValueRef wave_base =
         LLVMBuildMul(builder, wave_idx, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
      LLVMValueRef vertex_idx = LLVMBuildOr(builder, thread_id, wave_base, "");

      lds_base = LLVMBuildMul(builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (int out = 0; out < info->num_outputs; out++) {
      /* Viewport and layer outputs are not stored to the ring. */
      if (info->output_semantic[out] == VARYING_SLOT_VIEWPORT ||
          info->output_semantic[out] == VARYING_SLOT_LAYER)
         continue;

      int param = si_shader_io_get_unique_index(info->output_semantic[out], false);

      for (unsigned chan = 0; chan < 4; chan++) {
         /* Skip channels that are not in the usage mask. */
         if (!(info->output_usagemask[out] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * out + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         if (ring_in_lds) {
            /* GFX9 has the ESGS ring in LDS. */
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
         } else {
            ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                        ac_get_arg(&ctx->ac, ctx->args.es2gs_offset),
                                        (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
         }
      }
   }

   if (ring_in_lds)
      si_set_es_return_value_for_gs(ctx);
}
203

204
static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
205
{
206
   if (ctx->screen->info.chip_class >= GFX9)
207
      return si_unpack_param(ctx, ctx->args.merged_wave_info, 16, 8);
208
   else
209
      return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
210
}
211

212
static void emit_gs_epilogue(struct si_shader_context *ctx)
213
{
214
   if (ctx->shader->key.as_ngg) {
215
      gfx10_ngg_gs_emit_epilogue(ctx);
216
      return;
217
   }
218

219
   if (ctx->screen->info.chip_class >= GFX10)
220
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
221

222
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
223

224
   if (ctx->screen->info.chip_class >= GFX9)
225
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
226
}
227

228
static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
229
                                     LLVMValueRef *addrs)
230
{
231
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
232
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
233

234
   assert(info->num_outputs <= max_outputs);
235

236
   emit_gs_epilogue(ctx);
237
}
238

239
/* Emit one vertex from the geometry shader */
240
static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
241
{
242
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
243

244
   if (ctx->shader->key.as_ngg) {
245
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
246
      return;
247
   }
248

249
   struct si_shader_info *info = &ctx->shader->selector->info;
250
   struct si_shader *shader = ctx->shader;
251
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->args.gs2vs_offset);
252
   LLVMValueRef gs_next_vertex;
253
   LLVMValueRef can_emit;
254
   unsigned chan, offset;
255
   int i;
256

257
   /* Write vertex attribute values to GSVS ring */
258
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
259

260
   /* If this thread has already emitted the declared maximum number of
261
    * vertices, skip the write: excessive vertex emissions are not
262
    * supposed to have any effect.
263
    *
264
    * If the shader has no writes to memory, kill it instead. This skips
265
    * further memory loads and may allow LLVM to skip to the end
266
    * altogether.
267
    */
268
   can_emit =
269
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
270
                    LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");
271

272
   bool use_kill = !info->base.writes_memory;
273
   if (use_kill) {
274
      ac_build_kill_if_false(&ctx->ac, can_emit);
275
   } else {
276
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
277
   }
278

279
   offset = 0;
280
   for (i = 0; i < info->num_outputs; i++) {
281
      for (chan = 0; chan < 4; chan++) {
282
         if (!(info->output_usagemask[i] & (1 << chan)) ||
283
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
284
            continue;
285

286
         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
287
         LLVMValueRef voffset =
288
            LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
289
         offset++;
290

291
         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
292
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
293

294
         out_val = ac_to_integer(&ctx->ac, out_val);
295

296
         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
297
                                     0, ac_glc | ac_slc | ac_swizzled);
298
      }
299
   }
300

301
   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
302
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
303

304
   /* Signal vertex emission if vertex data was written. */
305
   if (offset) {
306
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
307
                       si_get_gs_wave_id(ctx));
308
   }
309

310
   if (!use_kill)
311
      ac_build_endif(&ctx->ac, 6505);
312
}
313

314
/* Cut one primitive from the geometry shader */
315
static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
316
{
317
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
318

319
   if (ctx->shader->key.as_ngg) {
320
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
321
      return;
322
   }
323

324
   /* Signal primitive cut */
325
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
326
                    si_get_gs_wave_id(ctx));
327
}
328

329
void si_preload_esgs_ring(struct si_shader_context *ctx)
330
{
331
   if (ctx->screen->info.chip_class <= GFX8) {
332
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
333
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
334
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
335

336
      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
337
   } else {
338
      if (USE_LDS_SYMBOLS) {
339
         /* Declare the ESGS ring as an explicit LDS symbol. */
340
         si_llvm_declare_esgs_ring(ctx);
341
      } else {
342
         ac_declare_lds_as_pointer(&ctx->ac);
343
         ctx->esgs_ring = ctx->ac.lds;
344
      }
345
   }
346
}
347

348
void si_preload_gs_rings(struct si_shader_context *ctx)
349
{
350
   const struct si_shader_selector *sel = ctx->shader->selector;
351
   LLVMBuilderRef builder = ctx->ac.builder;
352
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
353
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
354
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
355

356
   /* The conceptual layout of the GSVS ring is
357
    *   v0c0 .. vLv0 v0c1 .. vLc1 ..
358
    * but the real memory layout is swizzled across
359
    * threads:
360
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
361
    *   t16v0c0 ..
362
    * Override the buffer descriptor accordingly.
363
    */
364
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
365
   uint64_t stream_offset = 0;
366

367
   for (unsigned stream = 0; stream < 4; ++stream) {
368
      unsigned num_components;
369
      unsigned stride;
370
      unsigned num_records;
371
      LLVMValueRef ring, tmp;
372

373
      num_components = sel->info.num_stream_output_components[stream];
374
      if (!num_components)
375
         continue;
376

377
      stride = 4 * num_components * sel->info.base.gs.vertices_out;
378

379
      /* Limit on the stride field for <= GFX7. */
380
      assert(stride < (1 << 14));
381

382
      num_records = ctx->ac.wave_size;
383

384
      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
385
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
386
      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
387
      stream_offset += stride * ctx->ac.wave_size;
388

389
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
390
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
391
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
392
      tmp = LLVMBuildOr(
393
         builder, tmp,
394
         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
395
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
396
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
397
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");
398

399
      uint32_t rsrc3 =
400
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
401
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
402
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
403
         S_008F0C_ADD_TID_ENABLE(1);
404

405
      if (ctx->ac.chip_class >= GFX10) {
406
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
407
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
408
      } else {
409
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
410
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
411
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
412
      }
413

414
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
415
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");
416

417
      ctx->gsvs_ring[stream] = ring;
418
   }
419
}
420

421
/* Generate code for the hardware VS shader stage to go with a geometry shader */
422
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
423
                                             struct ac_llvm_compiler *compiler,
424
                                             struct si_shader_selector *gs_selector,
425
                                             struct pipe_debug_callback *debug)
426
{
427
   struct si_shader_context ctx;
428
   struct si_shader *shader;
429
   LLVMBuilderRef builder;
430
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
431
   struct si_shader_info *gsinfo = &gs_selector->info;
432
   int i;
433

434
   shader = CALLOC_STRUCT(si_shader);
435
   if (!shader)
436
      return NULL;
437

438
   /* We can leave the fence as permanently signaled because the GS copy
439
    * shader only becomes visible globally after it has been compiled. */
440
   util_queue_fence_init(&shader->ready);
441

442
   shader->selector = gs_selector;
443
   shader->is_gs_copy_shader = true;
444

445
   si_llvm_context_init(&ctx, sscreen, compiler,
446
                        si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
447
                                         false, false, false, false));
448
   ctx.shader = shader;
449
   ctx.stage = MESA_SHADER_VERTEX;
450

451
   builder = ctx.ac.builder;
452

453
   si_llvm_create_main_func(&ctx, false);
454

455
   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.internal_bindings);
456
   ctx.gsvs_ring[0] =
457
      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
458

459
   LLVMValueRef voffset =
460
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
461

462
   /* Fetch the vertex stream ID.*/
463
   LLVMValueRef stream_id;
464

465
   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
466
      stream_id = si_unpack_param(&ctx, ctx.args.streamout_config, 24, 2);
467
   else
468
      stream_id = ctx.ac.i32_0;
469

470
   /* Fill in output information. */
471
   for (i = 0; i < gsinfo->num_outputs; ++i) {
472
      outputs[i].semantic = gsinfo->output_semantic[i];
473

474
      for (int chan = 0; chan < 4; chan++) {
475
         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
476
      }
477
   }
478

479
   LLVMBasicBlockRef end_bb;
480
   LLVMValueRef switch_inst;
481

482
   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
483
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
484

485
   for (int stream = 0; stream < 4; stream++) {
486
      LLVMBasicBlockRef bb;
487
      unsigned offset;
488

489
      if (!gsinfo->num_stream_output_components[stream])
490
         continue;
491

492
      if (stream > 0 && !gs_selector->so.num_outputs)
493
         continue;
494

495
      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
496
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
497
      LLVMPositionBuilderAtEnd(builder, bb);
498

499
      /* Fetch vertex data from GSVS ring */
500
      offset = 0;
501
      for (i = 0; i < gsinfo->num_outputs; ++i) {
502
         for (unsigned chan = 0; chan < 4; chan++) {
503
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
504
                outputs[i].vertex_stream[chan] != stream) {
505
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
506
               continue;
507
            }
508

509
            LLVMValueRef soffset =
510
               LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
511
            offset++;
512

513
            outputs[i].values[chan] =
514
               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
515
                                    ctx.ac.f32, ac_glc | ac_slc, true, false);
516
         }
517
      }
518

519
      /* Streamout and exports. */
520
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
521
         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
522
      }
523

524
      if (stream == 0)
525
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
526

527
      LLVMBuildBr(builder, end_bb);
528
   }
529

530
   LLVMPositionBuilderAtEnd(builder, end_bb);
531

532
   LLVMBuildRetVoid(ctx.ac.builder);
533

534
   ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
535
   si_llvm_optimize_module(&ctx);
536

537
   bool ok = false;
538
   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
539
                       debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
540
      if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
541
         fprintf(stderr, "GS Copy Shader:\n");
542
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
543

544
      if (!ctx.shader->config.scratch_bytes_per_wave)
545
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
546
      else
547
         ok = true;
548
   }
549

550
   si_llvm_dispose(&ctx);
551

552
   if (!ok) {
553
      FREE(shader);
554
      shader = NULL;
555
   } else {
556
      si_fix_resource_usage(sscreen, shader);
557
   }
558
   return shader;
559
}
560

561
/**
562
 * Build the GS prolog function. Rotate the input vertices for triangle strips
563
 * with adjacency.
564
 */
565
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
566
{
567
   unsigned num_sgprs, num_vgprs;
568
   LLVMBuilderRef builder = ctx->ac.builder;
569
   LLVMTypeRef returns[AC_MAX_ARGS];
570
   LLVMValueRef func, ret;
571

572
   memset(&ctx->args, 0, sizeof(ctx->args));
573

574
   if (ctx->screen->info.chip_class >= GFX9) {
575
      /* Other user SGPRs are not needed by GS. */
576
      num_sgprs = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
577
      num_vgprs = 5; /* ES inputs are not needed by GS */
578
   } else {
579
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
580
      num_vgprs = 8;
581
   }
582

583
   for (unsigned i = 0; i < num_sgprs; ++i) {
584
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
585
      returns[i] = ctx->ac.i32;
586
   }
587

588
   for (unsigned i = 0; i < num_vgprs; ++i) {
589
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
590
      returns[num_sgprs + i] = ctx->ac.f32;
591
   }
592

593
   /* Create the function. */
594
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
595
   func = ctx->main_fn;
596

597
   /* Copy inputs to outputs. This should be no-op, as the registers match,
598
    * but it will prevent the compiler from overwriting them unintentionally.
599
    */
600
   ret = ctx->return_value;
601
   for (unsigned i = 0; i < num_sgprs; i++) {
602
      LLVMValueRef p = LLVMGetParam(func, i);
603
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
604
   }
605
   for (unsigned i = 0; i < num_vgprs; i++) {
606
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
607
      p = ac_to_float(&ctx->ac, p);
608
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
609
   }
610

611
   if (key->gs_prolog.states.tri_strip_adj_fix) {
612
      /* Remap the input vertices for every other primitive. */
613
      const struct ac_arg gfx6_vtx_params[6] = {
614
         {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
615
         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
616
         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
617
      };
618
      const struct ac_arg gfx9_vtx_params[3] = {
619
         {.used = true, .arg_index = num_sgprs},
620
         {.used = true, .arg_index = num_sgprs + 1},
621
         {.used = true, .arg_index = num_sgprs + 4},
622
      };
623
      LLVMValueRef vtx_in[6], vtx_out[6];
624
      LLVMValueRef prim_id, rotate;
625

626
      if (ctx->screen->info.chip_class >= GFX9) {
627
         for (unsigned i = 0; i < 3; i++) {
628
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
629
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
630
         }
631
      } else {
632
         for (unsigned i = 0; i < 6; i++)
633
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
634
      }
635

636
      prim_id = LLVMGetParam(func, num_sgprs + 2);
637
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
638

639
      for (unsigned i = 0; i < 6; ++i) {
640
         LLVMValueRef base, rotated;
641
         base = vtx_in[i];
642
         rotated = vtx_in[(i + 4) % 6];
643
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
644
      }
645

646
      if (ctx->screen->info.chip_class >= GFX9) {
647
         for (unsigned i = 0; i < 3; i++) {
648
            LLVMValueRef hi, out;
649

650
            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
651
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
652
            out = ac_to_float(&ctx->ac, out);
653
            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
654
         }
655
      } else {
656
         for (unsigned i = 0; i < 6; i++) {
657
            LLVMValueRef out;
658

659
            out = ac_to_float(&ctx->ac, vtx_out[i]);
660
            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
661
         }
662
      }
663
   }
664

665
   LLVMBuildRet(builder, ret);
666
}
667

668
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
669
{
670
   ctx->abi.load_inputs = si_nir_load_input_gs;
671
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
672
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
673
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
674
}
675

676
Product

Resources

Company