Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
   assert(index <= 1);

   if (index == 1)
      return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");

   return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
                        ctx->ac.i32, "");
}

static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;

   if (vs_blit_property) {
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
      }
      return;
   }
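
   /* Regular (non-blit) vertex fetch: load the attribute from its vertex
    * buffer descriptor (taken from user SGPRs when available), applying any
    * format fixups requested through the shader key (fix_fetch / opencode).
    */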
   /* Set can_speculate=false to help keep all loads grouped together
    * for better latency hiding. If it were true, LLVM could move the loads
    * forward and accidentally double memory latency by doing:
    *
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *
    * ... which is what we must prevent at all costs.
    */
   const bool can_speculate = false;
   unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32;
   LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
   LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    */
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
      for (unsigned i = 0; i < 4; ++i)
         out[i] =
            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");

      if (bit_size == 16) {
         if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
             fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
            for (unsigned i = 0; i < 4; i++)
               out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, "");
         } else {
            for (unsigned i = 0; i < 4; i++) {
               out[i] = ac_to_float(&ctx->ac, out[i]);
               out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, "");
            }
         }
      }
      return;
   }
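
   /* Only the components the shader actually reads need to be fetched:
    * input_usage_mask has one bit per component, so util_last_bit() yields
    * the number of channels counted from X up to the highest used one.
    */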
   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
   if (required_channels == 0) {
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMGetUndef(ctx->ac.f32);
      return;
   }

   /* Do multiple loads for special formats. */
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, can_speculate,
                                               bit_size == 16, false);
   }

   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(float_type);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = LLVMConstInt(int_type, 1, 0);
      else
         fetches[3] = LLVMConstReal(float_type, 1);
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {

      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      tmp = LLVMBuildShl(
         ctx->ac.builder, tmp,
         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
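      /* tmp now holds the sign-extended 2-bit W value. E.g. for the
       * integer-like formats with 32-bit lanes, a raw W of 3 becomes
       * 3 << 30 = 0xC0000000, and the arithmetic shift right by 30
       * yields -1 (a raw 2 yields -2).
       */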

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}

void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;

   for (unsigned i = 0; i < info->num_inputs; i++) {
      LLVMValueRef values[4];

      load_input_vs(ctx, i, values);

      for (unsigned chan = 0; chan < 4; chan++)
         ctx->inputs[i * 4 + chan] = ac_to_integer(&ctx->ac, values[chan]);
   }
}
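
/* Store one shader output (num_components consecutive components, starting at
 * start_component) to its streamout buffer as dwords at the precomputed
 * per-buffer write offset.
 */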
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4) */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      FALLTHROUGH;
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
                               so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
                               ac_glc | ac_slc);
}

/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->args.streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer. */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

      for (i = 0; i < 4; i++) {
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args.streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         if (reg >= noutput)
            continue;

         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
                                        &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}
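
/* Compute the user clip distances from gl_ClipVertex: dot the clip-vertex
 * value (out_elts) with each enabled user clip plane loaded from the
 * SI_VS_CONST_CLIP_PLANES constant buffer, and fill the two clip-distance
 * position exports (POS+2 and POS+3).
 */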
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
                                    LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
                            ~ctx->shader->key.opt.kill_clip_distances;

   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
         continue;

      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
            continue;

         for (const_chan = 0; const_chan < 4; const_chan++) {
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            args->out[chan] =
               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan],
                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLVMValueRef *values,
                                        unsigned target, struct ac_export_args *args)
{
   args->enabled_channels = 0xf; /* writemask - default is 0xf */
   args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
   args->done = 0;               /* Specify whether this is the last export */
   args->target = target;        /* Specify the target we are exporting */
   args->compr = false;

   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}
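
/* Assign consecutive PARAM export slots to the outputs that can be consumed
 * as pixel shader inputs, skipping outputs that belong to no component of
 * vertex stream 0 and outputs killed via the shader key, and record the slot
 * of each output in vs_output_param_offset[].
 */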
static void si_prepare_param_exports(struct si_shader_context *ctx,
                                     const struct si_shader_output_values *outputs, unsigned noutput,
                                     struct ac_export_args exports[32])
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic = outputs[i].semantic;

      if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
         continue;

      switch (semantic) {
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_FOGC:
         break;
      default:
         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
             semantic >= VARYING_SLOT_VAR0)
            break;
         else
            continue;
      }

      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
          shader->key.opt.kill_outputs &
          (1ull << si_shader_io_get_unique_index(semantic, true)))
         continue;

      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
                                  &exports[param_count]);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}

/**
 * Vertex color clamping.
 *
 * This uses a state constant loaded from a user data SGPR; if the constant
 * is true, an IF statement is added that clamps all colors.
 */
static void si_vertex_color_clamping(struct si_shader_context *ctx,
                                     struct si_shader_output_values *outputs, unsigned noutput)
{
   LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
   bool has_colors = false;

   /* Store original colors to alloca variables. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++)
         addr[i][j] = ac_build_alloca_init(&ctx->ac, outputs[i].values[j], "");

      has_colors = true;
   }

   if (!has_colors)
      return;

   /* The state is in the first bit of the user SGPR. */
   LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");

   ac_build_ifcc(&ctx->ac, cond, 6502);

   /* Store clamped colors to alloca variables within the conditional block. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
                        addr[i][j]);
      }
   }
   ac_build_endif(&ctx->ac, 6502);

   /* Load clamped colors */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic != VARYING_SLOT_COL0 &&
          outputs[i].semantic != VARYING_SLOT_COL1 &&
          outputs[i].semantic != VARYING_SLOT_BFC0 &&
          outputs[i].semantic != VARYING_SLOT_BFC1)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
      }
   }
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
                viewport_index_value = NULL;
   unsigned pos_idx, index;
   unsigned clipdist_mask = (shader->selector->clipdist_mask &
                             ~shader->key.opt.kill_clip_distances) |
                            shader->selector->culldist_mask;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   struct ac_export_args param_exports[32];
   si_prepare_param_exports(ctx, outputs, noutput, param_exports);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic) {
      case VARYING_SLOT_POS:
         si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_index_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_EDGE:
         edgeflag_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
         if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
                                        &pos_args[2 + index]);
         }
         break;
      case VARYING_SLOT_CLIP_VERTEX:
         si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool writes_psize = shader->selector->info.writes_psize && !shader->key.opt.kill_pointsize;
   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
   bool writes_vrs = ctx->screen->options.vrs2x2;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
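   /* Channel mapping for the POS+1 export: X = point size, Y = edge flag and
    * VRS rates, Z = layer (with the viewport index packed into bits 19:16 on
    * GFX9 and newer), W = viewport index on GFX8 and older.
    */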
   if (writes_psize || pos_writes_edgeflag || writes_vrs ||
       shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
      pos_args[1].enabled_channels = writes_psize |
                                     ((pos_writes_edgeflag | writes_vrs) << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0;         /* EXEC mask */
      pos_args[1].done = 0;               /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;              /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag. */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (writes_vrs) {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *    1: 2x coarser shading rate in that direction.
          *    0: normal shading rate
          *   -1: 2x finer shading rate (sample shading, not directional)
          *   -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, (1 << 2) | (1 << 4), 0);

         /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
         rates = LLVMBuildSelect(ctx->ac.builder,
                                 LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE,
                                               pos_args[0].out[3], ctx->ac.f32_1, ""),
                                 rates, ctx->ac.i32_0, "");

         LLVMValueRef v = ac_to_integer(&ctx->ac, pos_args[1].out[1]);
         v = LLVMBuildOr(ctx->ac.builder, v, rates, "");
         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.chip_class == GFX10)
      pos_args[0].valid_mask = 1;
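
   /* Assign consecutive POS0..POSn export targets to the position exports
    * that are present, and mark the last one as DONE.
    */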
   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports) {
         /* Specify that this is the last export */
         pos_args[i].done = 1;

         /* If a shader has no param exports, rasterization can start before
          * the shader finishes and thus memory stores might not finish before
          * the pixel shader starts.
          *
          * VLOAD is for atomics with return.
          */
         if (ctx->screen->info.chip_class >= GFX10 &&
             !shader->info.nr_param_exports &&
             shader->selector->info.base.writes_memory)
            ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
      }

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
      ac_build_export(&ctx->ac, &param_exports[i]);
}

void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader_output_values *outputs = NULL;
   int i, j;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

   for (i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic = info->output_semantic[i];

      for (j = 0; j < 4; j++) {
         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
      }
   }

   if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
      si_llvm_emit_streamout(ctx, outputs, i, 0);

   /* Export PrimitiveID. */
   if (ctx->shader->key.mono.u.vs_export_prim_id) {
      outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
      for (j = 1; j < 4; j++)
         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);

      memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
      i++;
   }

   si_llvm_build_vs_exports(ctx, outputs, i);
   FREE(outputs);
}
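
/* Epilogue used when the VS is compiled as a primitive discard compute
 * shader: instead of exporting, the position output is passed back through
 * the function return value so that the culling code appended after it can
 * consume it.
 */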
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                                  LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef pos[4] = {};

   assert(info->num_outputs <= max_outputs);

   for (unsigned i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic[i] != VARYING_SLOT_POS)
         continue;

      for (unsigned chan = 0; chan < 4; chan++)
         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
      break;
   }
   assert(pos[0] != NULL);

   /* Return the position output. */
   LLVMValueRef ret = ctx->return_value;
   for (unsigned chan = 0; chan < 4; chan++)
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
   ctx->return_value = ret;
}

/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }
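
   /* Input SGPR 3 is merged_wave_info: judging by the si_unpack_param calls
    * below, bits [7:0] hold the ES/LS thread count, bits [15:8] the GS/HS
    * thread count, and bits [27:24] the wave index within the threadgroup.
    */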
   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      if (!key->vs_prolog.is_monolithic)
         ac_init_exec_full_mask(&ctx->ac);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
   if (key->vs_prolog.load_vgprs_after_culling) {
      for (i = 5; i <= 8; i++) {
         bool is_tes_rel_patch_id = i == 7;
         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i],
                                            LLVMPointerType(is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32,
                                                            AC_ADDR_SPACE_LDS), "");
         input_vgprs[i] = LLVMBuildLoad(ctx->ac.builder, input_vgprs[i], "");
         if (is_tes_rel_patch_id)
            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      thread_id_in_tg =
         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                       ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ,
            si_unpack_param(ctx, input_sgpr_param[8 + SI_SGPR_VS_STATE_BITS], 4, 2),
            ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      input_vgprs[2] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6]; /* InstanceID */
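
      /* If the fast launch draw uses an index buffer, the real vertex index
       * still has to be fetched: the 64-bit index buffer address is packed
       * into the first two input SGPRs, and gs_fast_launch_index_size_packed
       * selects the element type (1 = 8-bit, 2 = 16-bit, 3 = 32-bit).
       */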
      if (key->vs_prolog.gs_fast_launch_index_size_packed) {
         LLVMTypeRef index_type = ctx->ac.voidt;

         switch (key->vs_prolog.gs_fast_launch_index_size_packed) {
         case 1:
            index_type = ctx->ac.i8;
            break;
         case 2:
            index_type = ctx->ac.i16;
            break;
         case 3:
            index_type = ctx->ac.i32;
            break;
         default:
            unreachable("invalid gs_fast_launch_index_size_packed");
         }

         LLVMValueRef sgprs[2] = {
            ac_get_arg(&ctx->ac, input_sgpr_param[0]),
            ac_get_arg(&ctx->ac, input_sgpr_param[1]),
         };
         LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2);
         indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, "");
         indices = LLVMBuildIntToPtr(ctx->ac.builder, indices,
                                     LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), "");

         LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], "");

         /* if (is ES thread...) */
         ac_build_ifcc(&ctx->ac,
                       LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                                     si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0);
         /* VertexID = indexBufferLoad(VertexID); */
         LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, "");
         index = LLVMBuildLoad(ctx->ac.builder, index, "");
         index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, "");
         LLVMBuildStore(ctx->ac.builder, index, vertex_id);
         ac_build_endif(&ctx->ac, 0);

         input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, "");
      }
   }

   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id =
         LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }
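
   /* Each input's load index is either:
    *   InstanceID / divisor + StartInstance   (instanced attributes), or
    *   VertexID + BaseVertex                  (per-vertex attributes).
    * A divisor of 1 skips the division; other divisors use the precomputed
    * factors fetched from the instance-divisor constant buffer above.
    */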
   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         LLVMValueRef udiv_factors[4];

         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such an InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}

static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   /* This doesn't happen with GL: */
   if (!non_indexed_is_zero)
      return ac_get_arg(&ctx->ac, ctx->args.base_vertex);

   /* For non-indexed draws, the base vertex set by the driver
    * (for direct draws) or the CP (for indirect draws) is the
    * first vertex ID, but GLSL expects 0 to be returned.
    */
   LLVMValueRef indexed = si_unpack_param(ctx, ctx->vs_state_bits, 1, 1);
   indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");

   return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
                          ctx->ac.i32_0, "");
}

void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   if (shader->key.as_ls)
      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
   else if (shader->key.as_es)
      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
   else if (shader->key.opt.vs_as_prim_discard_cs)
      ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
   else if (ngg_cull_shader)
      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
   else if (shader->key.as_ngg)
      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
   else
      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;

   ctx->abi.load_base_vertex = get_base_vertex;
}